1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 
10 #pragma once
11 
12 #include <unordered_map>
13 #include <string>
14 #include <vector>
15 #include <atomic>
16 
17 #include "db/memtable_list.h"
18 #include "db/table_cache.h"
19 #include "db/table_properties_collector.h"
20 #include "db/write_batch_internal.h"
21 #include "db/write_controller.h"
22 #include "options/cf_options.h"
23 #include "rocksdb/compaction_job_stats.h"
24 #include "rocksdb/db.h"
25 #include "rocksdb/env.h"
26 #include "rocksdb/options.h"
27 #include "trace_replay/block_cache_tracer.h"
28 #include "util/thread_local.h"
29 
30 namespace ROCKSDB_NAMESPACE {
31 
// Forward declarations for types that this header only refers to by pointer
// or reference (listed alphabetically).
class ColumnFamilyData;
class Compaction;
class CompactionPicker;
class DBImpl;
class InstrumentedMutex;
class InstrumentedMutexLock;
class InternalKey;
class InternalStats;
class LogBuffer;
class MemTable;
class MemTableListVersion;
class Version;
class VersionSet;
class VersionStorageInfo;

struct SuperVersionContext;

// Declared here only; the definition lives outside this header.
extern const double kIncSlowdownRatio;

49 // This file contains a list of data structures for managing column family
50 // level metadata.
51 //
// The basic relationships among classes declared here are illustrated as
// follows:
54 //
55 //       +----------------------+    +----------------------+   +--------+
56 //   +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 |   | DBImpl |
57 //   |   +----------------------+ |  +----------------------+   +----+---+
58 //   | +--------------------------+                                  |
59 //   | |                               +-----------------------------+
60 //   | |                               |
61 //   | | +-----------------------------v-------------------------------+
62 //   | | |                                                             |
63 //   | | |                      ColumnFamilySet                        |
64 //   | | |                                                             |
65 //   | | +-------------+--------------------------+----------------+---+
66 //   | |               |                          |                |
67 //   | +-------------------------------------+    |                |
68 //   |                 |                     |    |                v
69 //   |   +-------------v-------------+ +-----v----v---------+
70 //   |   |                           | |                    |
71 //   |   |     ColumnFamilyData 1    | | ColumnFamilyData 2 |    ......
72 //   |   |                           | |                    |
73 //   +--->                           | |                    |
74 //       |                 +---------+ |                    |
75 //       |                 | MemTable| |                    |
76 //       |                 |  List   | |                    |
77 //       +--------+---+--+-+----+----+ +--------------------++
78 //                |   |  |      |
79 //                |   |  |      |
80 //                |   |  |      +-----------------------+
81 //                |   |  +-----------+                  |
82 //                v   +--------+     |                  |
83 //       +--------+--------+   |     |                  |
84 //       |                 |   |     |       +----------v----------+
85 // +---> |SuperVersion 1.a +----------------->                     |
86 //       |                 +------+  |       | MemTableListVersion |
87 //       +---+-------------+   |  |  |       |                     |
88 //           |                 |  |  |       +----+------------+---+
89 //           |      current    |  |  |            |            |
90 //           |   +-------------+  |  |mem         |            |
91 //           |   |                |  |            |            |
92 //         +-v---v-------+    +---v--v---+  +-----v----+  +----v-----+
93 //         |             |    |          |  |          |  |          |
94 //         | Version 1.a |    | memtable |  | memtable |  | memtable |
95 //         |             |    |   1.a    |  |   1.b    |  |   1.c    |
96 //         +-------------+    |          |  |          |  |          |
97 //                            +----------+  +----------+  +----------+
98 //
// DBImpl keeps a ColumnFamilySet, which references all column families by
// pointing to the respective ColumnFamilyData object of each column family.
101 // This is how DBImpl can list and operate on all the column families.
102 // ColumnFamilyHandle also points to ColumnFamilyData directly, so that
103 // when a user executes a query, it can directly find memtables and Version
104 // as well as SuperVersion to the column family, without going through
105 // ColumnFamilySet.
106 //
107 // ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
108 // and SST files) indirectly, while ongoing operations may hold references
109 // to a current or an out-of-date SuperVersion, which in turn points to a
110 // point-in-time view of the LSM-tree. This guarantees the memtables and SST
111 // files being operated on will not go away, until the SuperVersion is
// unreferenced to 0 and destroyed.
113 //
114 // The following graph illustrates a possible referencing relationships:
115 //
116 // Column       +--------------+      current       +-----------+
117 // Family +---->+              +------------------->+           |
118 //  Data        | SuperVersion +----------+         | Version A |
119 //              |      3       |   imm    |         |           |
120 // Iter2 +----->+              |  +-------v------+  +-----------+
121 //              +-----+--------+  | MemtableList +----------------> Empty
122 //                    |           |   Version r  |  +-----------+
123 //                    |           +--------------+  |           |
124 //                    +------------------+   current| Version B |
125 //              +--------------+         |   +----->+           |
126 //              |              |         |   |      +-----+-----+
127 // Compaction +>+ SuperVersion +-------------+            ^
128 //    Job       |      2       +------+  |                |current
129 //              |              +----+ |  |     mem        |    +------------+
130 //              +--------------+    | |  +--------------------->            |
131 //                                  | +------------------------> MemTable a |
132 //                                  |          mem        |    |            |
133 //              +--------------+    |                     |    +------------+
134 //              |              +--------------------------+
135 //  Iter1 +-----> SuperVersion |    |                          +------------+
136 //              |      1       +------------------------------>+            |
137 //              |              +-+  |        mem               | MemTable b |
138 //              +--------------+ |  |                          |            |
139 //                               |  |    +--------------+      +-----^------+
140 //                               |  |imm | MemtableList |            |
141 //                               |  +--->+   Version s  +------------+
142 //                               |       +--------------+
143 //                               |       +--------------+
144 //                               |       | MemtableList |
145 //                               +------>+   Version t  +-------->  Empty
146 //                                 imm   +--------------+
147 //
// In this example, even though the current LSM-tree consists of Version A and
// memtable a, which are also referenced by the current SuperVersion
// (SuperVersion3), two older SuperVersions, SuperVersion2 and SuperVersion1,
// still exist, and are referenced by a compaction job and an old iterator
// Iter1, respectively. SuperVersion2
152 // contains Version B, memtable a and memtable b; SuperVersion1 contains
153 // Version B and memtable b (mutable). As a result, Version B and memtable b
154 // are prevented from being destroyed or deleted.
155 
156 // ColumnFamilyHandleImpl is the class that clients use to access different
157 // column families. It has non-trivial destructor, which gets called when client
158 // is done using the column family
159 class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
160  public:
161   // create while holding the mutex
162   ColumnFamilyHandleImpl(
163       ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex);
164   // destroy without mutex
165   virtual ~ColumnFamilyHandleImpl();
cfd()166   virtual ColumnFamilyData* cfd() const { return cfd_; }
167 
168   virtual uint32_t GetID() const override;
169   virtual const std::string& GetName() const override;
170   virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
171   virtual const Comparator* GetComparator() const override;
172 
173  private:
174   ColumnFamilyData* cfd_;
175   DBImpl* db_;
176   InstrumentedMutex* mutex_;
177 };
178 
179 // Does not ref-count ColumnFamilyData
180 // We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
181 // calls DBImpl methods. When this happens, MemTableInserter need access to
182 // ColumnFamilyHandle (same as the client would need). In that case, we feed
183 // MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
184 // methods
185 class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
186  public:
ColumnFamilyHandleInternal()187   ColumnFamilyHandleInternal()
188       : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), internal_cfd_(nullptr) {}
189 
SetCFD(ColumnFamilyData * _cfd)190   void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
cfd()191   virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
192 
193  private:
194   ColumnFamilyData* internal_cfd_;
195 };
196 
197 // holds references to memtable, all immutable memtables and version
198 struct SuperVersion {
199   // Accessing members of this class is not thread-safe and requires external
200   // synchronization (ie db mutex held or on write thread).
201   ColumnFamilyData* cfd;
202   MemTable* mem;
203   MemTableListVersion* imm;
204   Version* current;
205   MutableCFOptions mutable_cf_options;
206   // Version number of the current SuperVersion
207   uint64_t version_number;
208   WriteStallCondition write_stall_condition;
209 
210   InstrumentedMutex* db_mutex;
211 
212   // should be called outside the mutex
213   SuperVersion() = default;
214   ~SuperVersion();
215   SuperVersion* Ref();
216   // If Unref() returns true, Cleanup() should be called with mutex held
217   // before deleting this SuperVersion.
218   bool Unref();
219 
220   // call these two methods with db mutex held
221   // Cleanup unrefs mem, imm and current. Also, it stores all memtables
222   // that needs to be deleted in to_delete vector. Unrefing those
223   // objects needs to be done in the mutex
224   void Cleanup();
225   void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
226             MemTableListVersion* new_imm, Version* new_current);
227 
228   // The value of dummy is not actually used. kSVInUse takes its address as a
229   // mark in the thread local storage to indicate the SuperVersion is in use
230   // by thread. This way, the value of kSVInUse is guaranteed to have no
231   // conflict with SuperVersion object address and portable on different
232   // platform.
233   static int dummy;
234   static void* const kSVInUse;
235   static void* const kSVObsolete;
236 
237  private:
238   std::atomic<uint32_t> refs;
239   // We need to_delete because during Cleanup(), imm->Unref() returns
240   // all memtables that we need to free through this vector. We then
241   // delete all those memtables outside of mutex, during destruction
242   autovector<MemTable*> to_delete;
243 };
244 
245 extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
246 
247 extern Status CheckConcurrentWritesSupported(
248     const ColumnFamilyOptions& cf_options);
249 
250 extern Status CheckCFPathsSupported(const DBOptions& db_options,
251                                     const ColumnFamilyOptions& cf_options);
252 
253 extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
254                                            const ColumnFamilyOptions& src);
255 // Wrap user defined table proproties collector factories `from cf_options`
256 // into internal ones in int_tbl_prop_collector_factories. Add a system internal
257 // one too.
258 extern void GetIntTblPropCollectorFactory(
259     const ImmutableCFOptions& ioptions,
260     std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
261         int_tbl_prop_collector_factories);
262 
263 class ColumnFamilySet;
264 
265 // This class keeps all the data that a column family needs.
266 // Most methods require DB mutex held, unless otherwise noted
267 class ColumnFamilyData {
268  public:
269   ~ColumnFamilyData();
270 
271   // thread-safe
GetID()272   uint32_t GetID() const { return id_; }
273   // thread-safe
GetName()274   const std::string& GetName() const { return name_; }
275 
276   // Ref() can only be called from a context where the caller can guarantee
277   // that ColumnFamilyData is alive (while holding a non-zero ref already,
278   // holding a DB mutex, or as the leader in a write batch group).
Ref()279   void Ref() { refs_.fetch_add(1); }
280 
281   // Unref decreases the reference count, but does not handle deletion
282   // when the count goes to 0.  If this method returns true then the
283   // caller should delete the instance immediately, or later, by calling
284   // FreeDeadColumnFamilies().  Unref() can only be called while holding
285   // a DB mutex, or during single-threaded recovery.
Unref()286   bool Unref() {
287     int old_refs = refs_.fetch_sub(1);
288     assert(old_refs > 0);
289     return old_refs == 1;
290   }
291 
292   // UnrefAndTryDelete() decreases the reference count and do free if needed,
293   // return true if this is freed else false, UnrefAndTryDelete() can only
294   // be called while holding a DB mutex, or during single-threaded recovery.
295   bool UnrefAndTryDelete();
296 
297   // SetDropped() can only be called under following conditions:
298   // 1) Holding a DB mutex,
299   // 2) from single-threaded write thread, AND
300   // 3) from single-threaded VersionSet::LogAndApply()
301   // After dropping column family no other operation on that column family
302   // will be executed. All the files and memory will be, however, kept around
303   // until client drops the column family handle. That way, client can still
304   // access data from dropped column family.
305   // Column family can be dropped and still alive. In that state:
306   // *) Compaction and flush is not executed on the dropped column family.
307   // *) Client can continue reading from column family. Writes will fail unless
308   // WriteOptions::ignore_missing_column_families is true
309   // When the dropped column family is unreferenced, then we:
310   // *) Remove column family from the linked list maintained by ColumnFamilySet
311   // *) delete all memory associated with that column family
312   // *) delete all the files associated with that column family
313   void SetDropped();
IsDropped()314   bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); }
315 
316   // thread-safe
NumberLevels()317   int NumberLevels() const { return ioptions_.num_levels; }
318 
SetLogNumber(uint64_t log_number)319   void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
GetLogNumber()320   uint64_t GetLogNumber() const { return log_number_; }
321 
SetFlushReason(FlushReason flush_reason)322   void SetFlushReason(FlushReason flush_reason) {
323     flush_reason_ = flush_reason;
324   }
GetFlushReason()325   FlushReason GetFlushReason() const { return flush_reason_; }
326   // thread-safe
327   const FileOptions* soptions() const;
ioptions()328   const ImmutableCFOptions* ioptions() const { return &ioptions_; }
329   // REQUIRES: DB mutex held
330   // This returns the MutableCFOptions used by current SuperVersion
331   // You should use this API to reference MutableCFOptions most of the time.
GetCurrentMutableCFOptions()332   const MutableCFOptions* GetCurrentMutableCFOptions() const {
333     return &(super_version_->mutable_cf_options);
334   }
335   // REQUIRES: DB mutex held
336   // This returns the latest MutableCFOptions, which may be not in effect yet.
GetLatestMutableCFOptions()337   const MutableCFOptions* GetLatestMutableCFOptions() const {
338     return &mutable_cf_options_;
339   }
340 
341   // REQUIRES: DB mutex held
342   // Build ColumnFamiliesOptions with immutable options and latest mutable
343   // options.
344   ColumnFamilyOptions GetLatestCFOptions() const;
345 
is_delete_range_supported()346   bool is_delete_range_supported() { return is_delete_range_supported_; }
347 
348   // Validate CF options against DB options
349   static Status ValidateOptions(const DBOptions& db_options,
350                                 const ColumnFamilyOptions& cf_options);
351 #ifndef ROCKSDB_LITE
352   // REQUIRES: DB mutex held
353   Status SetOptions(
354       const DBOptions& db_options,
355       const std::unordered_map<std::string, std::string>& options_map);
356 #endif  // ROCKSDB_LITE
357 
internal_stats()358   InternalStats* internal_stats() { return internal_stats_.get(); }
359 
imm()360   MemTableList* imm() { return &imm_; }
mem()361   MemTable* mem() { return mem_; }
current()362   Version* current() { return current_; }
dummy_versions()363   Version* dummy_versions() { return dummy_versions_; }
364   void SetCurrent(Version* _current);
365   uint64_t GetNumLiveVersions() const;  // REQUIRE: DB mutex held
366   uint64_t GetTotalSstFilesSize() const;  // REQUIRE: DB mutex held
367   uint64_t GetLiveSstFilesSize() const;   // REQUIRE: DB mutex held
SetMemtable(MemTable * new_mem)368   void SetMemtable(MemTable* new_mem) {
369     uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
370     new_mem->SetID(memtable_id);
371     mem_ = new_mem;
372   }
373 
374   // calculate the oldest log needed for the durability of this column family
375   uint64_t OldestLogToKeep();
376 
377   // See Memtable constructor for explanation of earliest_seq param.
378   MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
379                                  SequenceNumber earliest_seq);
380   void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
381                          SequenceNumber earliest_seq);
382 
table_cache()383   TableCache* table_cache() const { return table_cache_.get(); }
384 
385   // See documentation in compaction_picker.h
386   // REQUIRES: DB mutex held
387   bool NeedsCompaction() const;
388   // REQUIRES: DB mutex held
389   Compaction* PickCompaction(const MutableCFOptions& mutable_options,
390                              LogBuffer* log_buffer);
391 
392   // Check if the passed range overlap with any running compactions.
393   // REQUIRES: DB mutex held
394   bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
395                                   const Slice& largest_user_key,
396                                   int level) const;
397 
398   // Check if the passed ranges overlap with any unflushed memtables
399   // (immutable or mutable).
400   //
401   // @param super_version A referenced SuperVersion that will be held for the
402   //    duration of this function.
403   //
404   // Thread-safe
405   Status RangesOverlapWithMemtables(const autovector<Range>& ranges,
406                                     SuperVersion* super_version, bool* overlap);
407 
408   // A flag to tell a manual compaction is to compact all levels together
409   // instead of a specific level.
410   static const int kCompactAllLevels;
411   // A flag to tell a manual compaction's output is base level.
412   static const int kCompactToBaseLevel;
413   // REQUIRES: DB mutex held
414   Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
415                            int input_level, int output_level,
416                            const CompactRangeOptions& compact_range_options,
417                            const InternalKey* begin, const InternalKey* end,
418                            InternalKey** compaction_end, bool* manual_conflict,
419                            uint64_t max_file_num_to_ignore);
420 
compaction_picker()421   CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
422   // thread-safe
user_comparator()423   const Comparator* user_comparator() const {
424     return internal_comparator_.user_comparator();
425   }
426   // thread-safe
internal_comparator()427   const InternalKeyComparator& internal_comparator() const {
428     return internal_comparator_;
429   }
430 
431   const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
int_tbl_prop_collector_factories()432   int_tbl_prop_collector_factories() const {
433     return &int_tbl_prop_collector_factories_;
434   }
435 
GetSuperVersion()436   SuperVersion* GetSuperVersion() { return super_version_; }
437   // thread-safe
438   // Return a already referenced SuperVersion to be used safely.
439   SuperVersion* GetReferencedSuperVersion(DBImpl* db);
440   // thread-safe
441   // Get SuperVersion stored in thread local storage. If it does not exist,
442   // get a reference from a current SuperVersion.
443   SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
444   // Try to return SuperVersion back to thread local storage. Retrun true on
445   // success and false on failure. It fails when the thread local storage
446   // contains anything other than SuperVersion::kSVInUse flag.
447   bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
448   // thread-safe
GetSuperVersionNumber()449   uint64_t GetSuperVersionNumber() const {
450     return super_version_number_.load();
451   }
452   // will return a pointer to SuperVersion* if previous SuperVersion
453   // if its reference count is zero and needs deletion or nullptr if not
454   // As argument takes a pointer to allocated SuperVersion to enable
455   // the clients to allocate SuperVersion outside of mutex.
456   // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
457   void InstallSuperVersion(SuperVersionContext* sv_context,
458                            InstrumentedMutex* db_mutex,
459                            const MutableCFOptions& mutable_cf_options);
460   void InstallSuperVersion(SuperVersionContext* sv_context,
461                            InstrumentedMutex* db_mutex);
462 
463   void ResetThreadLocalSuperVersions();
464 
465   // Protected by DB mutex
set_queued_for_flush(bool value)466   void set_queued_for_flush(bool value) { queued_for_flush_ = value; }
set_queued_for_compaction(bool value)467   void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; }
queued_for_flush()468   bool queued_for_flush() { return queued_for_flush_; }
queued_for_compaction()469   bool queued_for_compaction() { return queued_for_compaction_; }
470 
471   enum class WriteStallCause {
472     kNone,
473     kMemtableLimit,
474     kL0FileCountLimit,
475     kPendingCompactionBytes,
476   };
477   static std::pair<WriteStallCondition, WriteStallCause>
478   GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files,
479                                  uint64_t num_compaction_needed_bytes,
480                                  const MutableCFOptions& mutable_cf_options);
481 
482   // Recalculate some small conditions, which are changed only during
483   // compaction, adding new memtable and/or
484   // recalculation of compaction score. These values are used in
485   // DBImpl::MakeRoomForWrite function to decide, if it need to make
486   // a write stall
487   WriteStallCondition RecalculateWriteStallConditions(
488       const MutableCFOptions& mutable_cf_options);
489 
set_initialized()490   void set_initialized() { initialized_.store(true); }
491 
initialized()492   bool initialized() const { return initialized_.load(); }
493 
initial_cf_options()494   const ColumnFamilyOptions& initial_cf_options() {
495     return initial_cf_options_;
496   }
497 
498   Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
499 
500   // created_dirs remembers directory created, so that we don't need to call
501   // the same data creation operation again.
502   Status AddDirectories(
503       std::map<std::string, std::shared_ptr<Directory>>* created_dirs);
504 
505   Directory* GetDataDir(size_t path_id) const;
506 
TEST_GetLocalSV()507   ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
508 
509  private:
510   friend class ColumnFamilySet;
511   ColumnFamilyData(uint32_t id, const std::string& name,
512                    Version* dummy_versions, Cache* table_cache,
513                    WriteBufferManager* write_buffer_manager,
514                    const ColumnFamilyOptions& options,
515                    const ImmutableDBOptions& db_options,
516                    const FileOptions& file_options,
517                    ColumnFamilySet* column_family_set,
518                    BlockCacheTracer* const block_cache_tracer);
519 
520   uint32_t id_;
521   const std::string name_;
522   Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
523   Version* current_;         // == dummy_versions->prev_
524 
525   std::atomic<int> refs_;      // outstanding references to ColumnFamilyData
526   std::atomic<bool> initialized_;
527   std::atomic<bool> dropped_;  // true if client dropped it
528 
529   const InternalKeyComparator internal_comparator_;
530   std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
531       int_tbl_prop_collector_factories_;
532 
533   const ColumnFamilyOptions initial_cf_options_;
534   const ImmutableCFOptions ioptions_;
535   MutableCFOptions mutable_cf_options_;
536 
537   const bool is_delete_range_supported_;
538 
539   std::unique_ptr<TableCache> table_cache_;
540 
541   std::unique_ptr<InternalStats> internal_stats_;
542 
543   WriteBufferManager* write_buffer_manager_;
544 
545   MemTable* mem_;
546   MemTableList imm_;
547   SuperVersion* super_version_;
548 
549   // An ordinal representing the current SuperVersion. Updated by
550   // InstallSuperVersion(), i.e. incremented every time super_version_
551   // changes.
552   std::atomic<uint64_t> super_version_number_;
553 
554   // Thread's local copy of SuperVersion pointer
555   // This needs to be destructed before mutex_
556   std::unique_ptr<ThreadLocalPtr> local_sv_;
557 
558   // pointers for a circular linked list. we use it to support iterations over
559   // all column families that are alive (note: dropped column families can also
560   // be alive as long as client holds a reference)
561   ColumnFamilyData* next_;
562   ColumnFamilyData* prev_;
563 
564   // This is the earliest log file number that contains data from this
565   // Column Family. All earlier log files must be ignored and not
566   // recovered from
567   uint64_t log_number_;
568 
569   std::atomic<FlushReason> flush_reason_;
570 
571   // An object that keeps all the compaction stats
572   // and picks the next compaction
573   std::unique_ptr<CompactionPicker> compaction_picker_;
574 
575   ColumnFamilySet* column_family_set_;
576 
577   std::unique_ptr<WriteControllerToken> write_controller_token_;
578 
579   // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
580   bool queued_for_flush_;
581 
582   // If true --> this ColumnFamily is currently present in
583   // DBImpl::compaction_queue_
584   bool queued_for_compaction_;
585 
586   uint64_t prev_compaction_needed_bytes_;
587 
588   // if the database was opened with 2pc enabled
589   bool allow_2pc_;
590 
591   // Memtable id to track flush.
592   std::atomic<uint64_t> last_memtable_id_;
593 
594   // Directories corresponding to cf_paths.
595   std::vector<std::shared_ptr<Directory>> data_dirs_;
596 };
597 
598 // ColumnFamilySet has interesting thread-safety requirements
599 // * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
600 // mutex AND executed in the write thread.
601 // CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
602 // single-threaded write thread. It is also called during Recovery and in
603 // DumpManifest().
604 // RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
605 // held and it needs to be executed from the write thread. SetDropped() also
606 // guarantees that it will be called only from single-threaded LogAndApply(),
607 // but this condition is not that important.
608 // * Iteration -- hold DB mutex, but you can release it in the body of
609 // iteration. If you release DB mutex in body, reference the column
610 // family before the mutex and unreference after you unlock, since the column
611 // family might get dropped when the DB mutex is released
612 // * GetDefault() -- thread safe
613 // * GetColumnFamily() -- either inside of DB mutex or from a write thread
614 // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
615 // NumberOfColumnFamilies -- inside of DB mutex
616 class ColumnFamilySet {
617  public:
618   // ColumnFamilySet supports iteration
619   class iterator {
620    public:
iterator(ColumnFamilyData * cfd)621     explicit iterator(ColumnFamilyData* cfd)
622         : current_(cfd) {}
623     iterator& operator++() {
624       // dropped column families might still be included in this iteration
625       // (we're only removing them when client drops the last reference to the
626       // column family).
627       // dummy is never dead, so this will never be infinite
628       do {
629         current_ = current_->next_;
630       } while (current_->refs_.load(std::memory_order_relaxed) == 0);
631       return *this;
632     }
633     bool operator!=(const iterator& other) {
634       return this->current_ != other.current_;
635     }
636     ColumnFamilyData* operator*() { return current_; }
637 
638    private:
639     ColumnFamilyData* current_;
640   };
641 
642   ColumnFamilySet(const std::string& dbname,
643                   const ImmutableDBOptions* db_options,
644                   const FileOptions& file_options, Cache* table_cache,
645                   WriteBufferManager* write_buffer_manager,
646                   WriteController* write_controller,
647                   BlockCacheTracer* const block_cache_tracer);
648   ~ColumnFamilySet();
649 
650   ColumnFamilyData* GetDefault() const;
651   // GetColumnFamily() calls return nullptr if column family is not found
652   ColumnFamilyData* GetColumnFamily(uint32_t id) const;
653   ColumnFamilyData* GetColumnFamily(const std::string& name) const;
654   // this call will return the next available column family ID. it guarantees
655   // that there is no column family with id greater than or equal to the
656   // returned value in the current running instance or anytime in RocksDB
657   // instance history.
658   uint32_t GetNextColumnFamilyID();
659   uint32_t GetMaxColumnFamily();
660   void UpdateMaxColumnFamily(uint32_t new_max_column_family);
661   size_t NumberOfColumnFamilies() const;
662 
  // Declared here, defined elsewhere.
  // NOTE(review): presumably creates the ColumnFamilyData for `name` / `id`
  // using `options` and registers it in this set; the role of `dummy_version`
  // is not visible from this header -- confirm against the definition.
  ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
                                       Version* dummy_version,
                                       const ColumnFamilyOptions& options);
666 
begin()667   iterator begin() { return iterator(dummy_cfd_->next_); }
end()668   iterator end() { return iterator(dummy_cfd_); }
669 
  // NOTE(review): judging by the name, frees column families that are no
  // longer referenced -- confirm against the definition.
  // REQUIRES: DB mutex held
  // REQUIRES: must not be called while iterating over this ColumnFamilySet
  void FreeDeadColumnFamilies();
673 
get_table_cache()674   Cache* get_table_cache() { return table_cache_; }
675 
676  private:
677   friend class ColumnFamilyData;
  // Helper that gets called from the ColumnFamilyData destructor.
  // REQUIRES: DB mutex held
  void RemoveColumnFamily(ColumnFamilyData* cfd);
681 
  // column_families_ and column_family_data_ need to be protected:
  // * when mutating, both conditions have to be satisfied:
  //   1. DB mutex locked
  //   2. thread currently in the single-threaded write thread
  // * when reading, at least one condition needs to be satisfied:
  //   1. DB mutex locked
  //   2. accessed from the single-threaded write thread
  // NOTE(review): column_families_ presumably maps a column family name to its
  // ID, and column_family_data_ maps an ID to its ColumnFamilyData -- confirm.
  std::unordered_map<std::string, uint32_t> column_families_;
  std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;

  uint32_t max_column_family_;
  // Node used by begin()/end(): end() wraps dummy_cfd_ itself and begin()
  // wraps dummy_cfd_->next_.
  ColumnFamilyData* dummy_cfd_;
  // We don't hold a refcount here, since the default column family always
  // exists. We are also not responsible for cleaning up default_cfd_cache_.
  // This is just a cache that makes the common case (accessing the default
  // column family) faster.
  ColumnFamilyData* default_cfd_cache_;

  const std::string db_name_;
  const ImmutableDBOptions* const db_options_;
  const FileOptions file_options_;
  // Returned as-is by get_table_cache().
  Cache* table_cache_;
  WriteBufferManager* write_buffer_manager_;
  WriteController* write_controller_;
  BlockCacheTracer* const block_cache_tracer_;
707 };
708 
709 // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
710 // memtables of different column families (specified by ID in the write batch)
711 class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
712  public:
ColumnFamilyMemTablesImpl(ColumnFamilySet * column_family_set)713   explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
714       : column_family_set_(column_family_set), current_(nullptr) {}
715 
716   // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
717   // with the arguments used to construct *orig.
ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl * orig)718   explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig)
719       : column_family_set_(orig->column_family_set_), current_(nullptr) {}
720 
721   // sets current_ to ColumnFamilyData with column_family_id
722   // returns false if column family doesn't exist
723   // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
724   //           under a DB mutex OR from a write thread
725   bool Seek(uint32_t column_family_id) override;
726 
727   // Returns log number of the selected column family
728   // REQUIRES: under a DB mutex OR from a write thread
729   uint64_t GetLogNumber() const override;
730 
731   // REQUIRES: Seek() called first
732   // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
733   //           under a DB mutex OR from a write thread
734   virtual MemTable* GetMemTable() const override;
735 
736   // Returns column family handle for the selected column family
737   // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
738   //           under a DB mutex OR from a write thread
739   virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
740 
741   // Cannot be called while another thread is calling Seek().
742   // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
743   //           under a DB mutex OR from a write thread
current()744   virtual ColumnFamilyData* current() override { return current_; }
745 
746  private:
747   ColumnFamilySet* column_family_set_;
748   ColumnFamilyData* current_;
749   ColumnFamilyHandleInternal handle_;
750 };
751 
752 extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
753 
754 extern const Comparator* GetColumnFamilyUserComparator(
755     ColumnFamilyHandle* column_family);
756 
757 }  // namespace ROCKSDB_NAMESPACE
758