1 // Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE file. See the AUTHORS file for names of contributors.
8 
9 #pragma once
10 
11 #include <stdint.h>
12 #include <stdio.h>
13 #include <map>
14 #include <memory>
15 #include <string>
16 #include <unordered_map>
17 #include <vector>
18 #include "rocksdb/iterator.h"
19 #include "rocksdb/listener.h"
20 #include "rocksdb/metadata.h"
21 #include "rocksdb/options.h"
22 #include "rocksdb/snapshot.h"
23 #include "rocksdb/sst_file_writer.h"
24 #include "rocksdb/thread_status.h"
25 #include "rocksdb/transaction_log.h"
26 #include "rocksdb/types.h"
27 #include "rocksdb/version.h"
28 
29 #ifdef _WIN32
30 // Windows API macro interference
31 #undef DeleteFile
32 #endif
33 
// ROCKSDB_DEPRECATED_FUNC marks a function declaration as deprecated using
// the compiler-specific spelling. On toolchains matching none of the branches
// below it expands to nothing, so declarations that use it still compile.
#if defined(__GNUC__) || defined(__clang__)
#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
#elif defined(_WIN32)
#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
#else
#define ROCKSDB_DEPRECATED_FUNC
#endif
39 
40 namespace ROCKSDB_NAMESPACE {
41 
42 struct Options;
43 struct DBOptions;
44 struct ColumnFamilyOptions;
45 struct ReadOptions;
46 struct WriteOptions;
47 struct FlushOptions;
48 struct CompactionOptions;
49 struct CompactRangeOptions;
50 struct TableProperties;
51 struct ExternalSstFileInfo;
52 class WriteBatch;
53 class Env;
54 class EventListener;
55 class StatsHistoryIterator;
56 class TraceWriter;
57 #ifdef ROCKSDB_LITE
58 class CompactionJobInfo;
59 #endif
60 class FileSystem;
61 
62 extern const std::string kDefaultColumnFamilyName;
63 extern const std::string kPersistentStatsColumnFamilyName;
64 struct ColumnFamilyDescriptor {
65   std::string name;
66   ColumnFamilyOptions options;
ColumnFamilyDescriptorColumnFamilyDescriptor67   ColumnFamilyDescriptor()
68       : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
ColumnFamilyDescriptorColumnFamilyDescriptor69   ColumnFamilyDescriptor(const std::string& _name,
70                          const ColumnFamilyOptions& _options)
71       : name(_name), options(_options) {}
72 };
73 
74 class ColumnFamilyHandle {
75  public:
~ColumnFamilyHandle()76   virtual ~ColumnFamilyHandle() {}
77   // Returns the name of the column family associated with the current handle.
78   virtual const std::string& GetName() const = 0;
79   // Returns the ID of the column family associated with the current handle.
80   virtual uint32_t GetID() const = 0;
81   // Fills "*desc" with the up-to-date descriptor of the column family
82   // associated with this handle. Since it fills "*desc" with the up-to-date
83   // information, this call might internally lock and release DB mutex to
84   // access the up-to-date CF options.  In addition, all the pointer-typed
85   // options cannot be referenced any longer than the original options exist.
86   //
87   // Note that this function is not supported in RocksDBLite.
88   virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) = 0;
89   // Returns the comparator of the column family associated with the
90   // current handle.
91   virtual const Comparator* GetComparator() const = 0;
92 };
93 
// Major and minor version numbers, taken from the __ROCKSDB_MAJOR__ and
// __ROCKSDB_MINOR__ macros.
// NOTE(review): the macros are presumably provided by "rocksdb/version.h",
// which this header includes - confirm.
static const int kMajorVersion = __ROCKSDB_MAJOR__;
static const int kMinorVersion = __ROCKSDB_MINOR__;
96 
97 // A range of keys
98 struct Range {
99   Slice start;
100   Slice limit;
101 
RangeRange102   Range() {}
RangeRange103   Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
104 };
105 
106 struct RangePtr {
107   const Slice* start;
108   const Slice* limit;
109 
RangePtrRangePtr110   RangePtr() : start(nullptr), limit(nullptr) {}
RangePtrRangePtr111   RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {}
112 };
113 
114 struct IngestExternalFileArg {
115   ColumnFamilyHandle* column_family = nullptr;
116   std::vector<std::string> external_files;
117   IngestExternalFileOptions options;
118 };
119 
// Options passed to DB::GetMergeOperands().
struct GetMergeOperandsOptions {
  // Number of operands the caller-allocated output array can hold. If the DB
  // holds more operands for the key than this, GetMergeOperands() returns no
  // operands and an Incomplete status. Defaults to 0.
  int expected_max_number_of_operands{0};
};
123 
124 // A collections of table properties objects, where
125 //  key: is the table's file name.
126 //  value: the table properties object of the given table.
127 typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
128     TablePropertiesCollection;
129 
130 // A DB is a persistent ordered map from keys to values.
131 // A DB is safe for concurrent access from multiple threads without
132 // any external synchronization.
133 class DB {
134  public:
135   // Open the database with the specified "name".
136   // Stores a pointer to a heap-allocated database in *dbptr and returns
137   // OK on success.
138   // Stores nullptr in *dbptr and returns a non-OK status on error.
139   // Caller should delete *dbptr when it is no longer needed.
140   static Status Open(const Options& options, const std::string& name,
141                      DB** dbptr);
142 
143   // Open the database for read only. All DB interfaces
144   // that modify data, like put/delete, will return error.
145   // If the db is opened in read only mode, then no compactions
146   // will happen.
147   //
148   // Not supported in ROCKSDB_LITE, in which case the function will
149   // return Status::NotSupported.
150   static Status OpenForReadOnly(const Options& options, const std::string& name,
151                                 DB** dbptr,
152                                 bool error_if_log_file_exist = false);
153 
154   // Open the database for read only with column families. When opening DB with
155   // read only, you can specify only a subset of column families in the
156   // database that should be opened. However, you always need to specify default
157   // column family. The default column family name is 'default' and it's stored
158   // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName
159   //
160   // Not supported in ROCKSDB_LITE, in which case the function will
161   // return Status::NotSupported.
162   static Status OpenForReadOnly(
163       const DBOptions& db_options, const std::string& name,
164       const std::vector<ColumnFamilyDescriptor>& column_families,
165       std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
166       bool error_if_log_file_exist = false);
167 
168   // The following OpenAsSecondary functions create a secondary instance that
169   // can dynamically tail the MANIFEST of a primary that must have already been
170   // created. User can call TryCatchUpWithPrimary to make the secondary
171   // instance catch up with primary (WAL tailing is NOT supported now) whenever
172   // the user feels necessary. Column families created by the primary after the
173   // secondary instance starts are currently ignored by the secondary instance.
174   // Column families opened by secondary and dropped by the primary will be
175   // dropped by secondary as well. However the user of the secondary instance
176   // can still access the data of such dropped column family as long as they
177   // do not destroy the corresponding column family handle.
178   // WAL tailing is not supported at present, but will arrive soon.
179   //
180   // The options argument specifies the options to open the secondary instance.
181   // The name argument specifies the name of the primary db that you have used
182   // to open the primary instance.
183   // The secondary_path argument points to a directory where the secondary
184   // instance stores its info log.
185   // The dbptr is an out-arg corresponding to the opened secondary instance.
186   // The pointer points to a heap-allocated database, and the user should
187   // delete it after use.
188   // Open DB as secondary instance with only the default column family.
189   // Return OK on success, non-OK on failures.
190   static Status OpenAsSecondary(const Options& options, const std::string& name,
191                                 const std::string& secondary_path, DB** dbptr);
192 
193   // Open DB as secondary instance with column families. You can open a subset
194   // of column families in secondary mode.
195   // The db_options specify the database specific options.
196   // The name argument specifies the name of the primary db that you have used
197   // to open the primary instance.
198   // The secondary_path argument points to a directory where the secondary
199   // instance stores its info log.
  // The column_families argument specifies a list of column families to open.
201   // If any of the column families does not exist, the function returns non-OK
202   // status.
  // The handles is an out-arg corresponding to the opened database column
  // family handles.
205   // The dbptr is an out-arg corresponding to the opened secondary instance.
206   // The pointer points to a heap-allocated database, and the caller should
207   // delete it after use. Before deleting the dbptr, the user should also
208   // delete the pointers stored in handles vector.
  // Return OK on success, non-OK on failures.
210   static Status OpenAsSecondary(
211       const DBOptions& db_options, const std::string& name,
212       const std::string& secondary_path,
213       const std::vector<ColumnFamilyDescriptor>& column_families,
214       std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
215 
216   // Open DB with column families.
217   // db_options specify database specific options
218   // column_families is the vector of all column families in the database,
219   // containing column family name and options. You need to open ALL column
220   // families in the database. To get the list of column families, you can use
221   // ListColumnFamilies(). Also, you can open only a subset of column families
222   // for read-only access.
223   // The default column family name is 'default' and it's stored
224   // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName.
225   // If everything is OK, handles will on return be the same size
226   // as column_families --- handles[i] will be a handle that you
227   // will use to operate on column family column_family[i].
  // Before deleting the DB, you have to close all column families by calling
  // DestroyColumnFamilyHandle() with all the handles.
230   static Status Open(const DBOptions& db_options, const std::string& name,
231                      const std::vector<ColumnFamilyDescriptor>& column_families,
232                      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
233 
  // Default implementation: performs no work and returns
  // Status::NotSupported().
  // NOTE(review): presumably overridden by DB implementations that are able
  // to resume operation after an error - confirm against the implementation.
  virtual Status Resume() { return Status::NotSupported(); }
235 
236   // Close the DB by releasing resources, closing files etc. This should be
237   // called before calling the destructor so that the caller can get back a
238   // status in case there are any errors. This will not fsync the WAL files.
239   // If syncing is required, the caller must first call SyncWAL(), or Write()
240   // using an empty write batch with WriteOptions.sync=true.
241   // Regardless of the return status, the DB must be freed.
242   // If the return status is Aborted(), closing fails because there is
243   // unreleased snapshot in the system. In this case, users can release
244   // the unreleased snapshots and try again and expect it to succeed. For
245   // other status, recalling Close() will be no-op.
246   // If the return status is NotSupported(), then the DB implementation does
247   // cleanup in the destructor
Close()248   virtual Status Close() { return Status::NotSupported(); }
249 
250   // ListColumnFamilies will open the DB specified by argument name
251   // and return the list of all column families in that DB
252   // through column_families argument. The ordering of
253   // column families in column_families is unspecified.
254   static Status ListColumnFamilies(const DBOptions& db_options,
255                                    const std::string& name,
256                                    std::vector<std::string>* column_families);
257 
DB()258   DB() {}
259   // No copying allowed
260   DB(const DB&) = delete;
261   void operator=(const DB&) = delete;
262 
263   virtual ~DB();
264 
265   // Create a column_family and return the handle of column family
266   // through the argument handle.
267   virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
268                                     const std::string& column_family_name,
269                                     ColumnFamilyHandle** handle);
270 
271   // Bulk create column families with the same column family options.
272   // Return the handles of the column families through the argument handles.
273   // In case of error, the request may succeed partially, and handles will
274   // contain column family handles that it managed to create, and have size
275   // equal to the number of created column families.
276   virtual Status CreateColumnFamilies(
277       const ColumnFamilyOptions& options,
278       const std::vector<std::string>& column_family_names,
279       std::vector<ColumnFamilyHandle*>* handles);
280 
281   // Bulk create column families.
282   // Return the handles of the column families through the argument handles.
283   // In case of error, the request may succeed partially, and handles will
284   // contain column family handles that it managed to create, and have size
285   // equal to the number of created column families.
286   virtual Status CreateColumnFamilies(
287       const std::vector<ColumnFamilyDescriptor>& column_families,
288       std::vector<ColumnFamilyHandle*>* handles);
289 
290   // Drop a column family specified by column_family handle. This call
291   // only records a drop record in the manifest and prevents the column
292   // family from flushing and compacting.
293   virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
294 
295   // Bulk drop column families. This call only records drop records in the
296   // manifest and prevents the column families from flushing and compacting.
297   // In case of error, the request may succeed partially. User may call
298   // ListColumnFamilies to check the result.
299   virtual Status DropColumnFamilies(
300       const std::vector<ColumnFamilyHandle*>& column_families);
301 
302   // Close a column family specified by column_family handle and destroy
303   // the column family handle specified to avoid double deletion. This call
304   // deletes the column family handle by default. Use this method to
305   // close column family instead of deleting column family handle directly
306   virtual Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family);
307 
308   // Set the database entry for "key" to "value".
309   // If "key" already exists, it will be overwritten.
310   // Returns OK on success, and a non-OK status on error.
311   // Note: consider setting options.sync = true.
312   virtual Status Put(const WriteOptions& options,
313                      ColumnFamilyHandle* column_family, const Slice& key,
314                      const Slice& value) = 0;
Put(const WriteOptions & options,const Slice & key,const Slice & value)315   virtual Status Put(const WriteOptions& options, const Slice& key,
316                      const Slice& value) {
317     return Put(options, DefaultColumnFamily(), key, value);
318   }
319 
320   // Remove the database entry (if any) for "key".  Returns OK on
321   // success, and a non-OK status on error.  It is not an error if "key"
322   // did not exist in the database.
323   // Note: consider setting options.sync = true.
324   virtual Status Delete(const WriteOptions& options,
325                         ColumnFamilyHandle* column_family,
326                         const Slice& key) = 0;
Delete(const WriteOptions & options,const Slice & key)327   virtual Status Delete(const WriteOptions& options, const Slice& key) {
328     return Delete(options, DefaultColumnFamily(), key);
329   }
330 
331   // Remove the database entry for "key". Requires that the key exists
332   // and was not overwritten. Returns OK on success, and a non-OK status
333   // on error.  It is not an error if "key" did not exist in the database.
334   //
335   // If a key is overwritten (by calling Put() multiple times), then the result
336   // of calling SingleDelete() on this key is undefined.  SingleDelete() only
337   // behaves correctly if there has been only one Put() for this key since the
338   // previous call to SingleDelete() for this key.
339   //
340   // This feature is currently an experimental performance optimization
341   // for a very specific workload.  It is up to the caller to ensure that
342   // SingleDelete is only used for a key that is not deleted using Delete() or
343   // written using Merge().  Mixing SingleDelete operations with Deletes and
344   // Merges can result in undefined behavior.
345   //
346   // Note: consider setting options.sync = true.
347   virtual Status SingleDelete(const WriteOptions& options,
348                               ColumnFamilyHandle* column_family,
349                               const Slice& key) = 0;
SingleDelete(const WriteOptions & options,const Slice & key)350   virtual Status SingleDelete(const WriteOptions& options, const Slice& key) {
351     return SingleDelete(options, DefaultColumnFamily(), key);
352   }
353 
354   // Removes the database entries in the range ["begin_key", "end_key"), i.e.,
355   // including "begin_key" and excluding "end_key". Returns OK on success, and
356   // a non-OK status on error. It is not an error if no keys exist in the range
357   // ["begin_key", "end_key").
358   //
359   // This feature is now usable in production, with the following caveats:
360   // 1) Accumulating many range tombstones in the memtable will degrade read
361   // performance; this can be avoided by manually flushing occasionally.
362   // 2) Limiting the maximum number of open files in the presence of range
363   // tombstones can degrade read performance. To avoid this problem, set
364   // max_open_files to -1 whenever possible.
365   virtual Status DeleteRange(const WriteOptions& options,
366                              ColumnFamilyHandle* column_family,
367                              const Slice& begin_key, const Slice& end_key);
368 
369   // Merge the database entry for "key" with "value".  Returns OK on success,
370   // and a non-OK status on error. The semantics of this operation is
371   // determined by the user provided merge_operator when opening DB.
372   // Note: consider setting options.sync = true.
373   virtual Status Merge(const WriteOptions& options,
374                        ColumnFamilyHandle* column_family, const Slice& key,
375                        const Slice& value) = 0;
Merge(const WriteOptions & options,const Slice & key,const Slice & value)376   virtual Status Merge(const WriteOptions& options, const Slice& key,
377                        const Slice& value) {
378     return Merge(options, DefaultColumnFamily(), key, value);
379   }
380 
381   // Apply the specified updates to the database.
382   // If `updates` contains no update, WAL will still be synced if
383   // options.sync=true.
384   // Returns OK on success, non-OK on failure.
385   // Note: consider setting options.sync = true.
386   virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
387 
388   // If the database contains an entry for "key" store the
389   // corresponding value in *value and return OK.
390   //
391   // If there is no entry for "key" leave *value unchanged and return
392   // a status for which Status::IsNotFound() returns true.
393   //
394   // May return some other Status on an error.
Get(const ReadOptions & options,ColumnFamilyHandle * column_family,const Slice & key,std::string * value)395   virtual inline Status Get(const ReadOptions& options,
396                             ColumnFamilyHandle* column_family, const Slice& key,
397                             std::string* value) {
398     assert(value != nullptr);
399     PinnableSlice pinnable_val(value);
400     assert(!pinnable_val.IsPinned());
401     auto s = Get(options, column_family, key, &pinnable_val);
402     if (s.ok() && pinnable_val.IsPinned()) {
403       value->assign(pinnable_val.data(), pinnable_val.size());
404     }  // else value is already assigned
405     return s;
406   }
407   virtual Status Get(const ReadOptions& options,
408                      ColumnFamilyHandle* column_family, const Slice& key,
409                      PinnableSlice* value) = 0;
Get(const ReadOptions & options,const Slice & key,std::string * value)410   virtual Status Get(const ReadOptions& options, const Slice& key,
411                      std::string* value) {
412     return Get(options, DefaultColumnFamily(), key, value);
413   }
414 
415   // Returns all the merge operands corresponding to the key. If the
416   // number of merge operands in DB is greater than
417   // merge_operands_options.expected_max_number_of_operands
418   // no merge operands are returned and status is Incomplete. Merge operands
419   // returned are in the order of insertion.
420   // merge_operands- Points to an array of at-least
421   //             merge_operands_options.expected_max_number_of_operands and the
422   //             caller is responsible for allocating it. If the status
423   //             returned is Incomplete then number_of_operands will contain
424   //             the total number of merge operands found in DB for key.
425   virtual Status GetMergeOperands(
426       const ReadOptions& options, ColumnFamilyHandle* column_family,
427       const Slice& key, PinnableSlice* merge_operands,
428       GetMergeOperandsOptions* get_merge_operands_options,
429       int* number_of_operands) = 0;
430 
431   // If keys[i] does not exist in the database, then the i'th returned
432   // status will be one for which Status::IsNotFound() is true, and
433   // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
434   // the i'th returned status will have Status::ok() true, and (*values)[i]
435   // will store the value associated with keys[i].
436   //
437   // (*values) will always be resized to be the same size as (keys).
438   // Similarly, the number of returned statuses will be the number of keys.
439   // Note: keys will not be "de-duplicated". Duplicate keys will return
440   // duplicate values in order.
441   virtual std::vector<Status> MultiGet(
442       const ReadOptions& options,
443       const std::vector<ColumnFamilyHandle*>& column_family,
444       const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
MultiGet(const ReadOptions & options,const std::vector<Slice> & keys,std::vector<std::string> * values)445   virtual std::vector<Status> MultiGet(const ReadOptions& options,
446                                        const std::vector<Slice>& keys,
447                                        std::vector<std::string>* values) {
448     return MultiGet(
449         options,
450         std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()),
451         keys, values);
452   }
453 
454   // Overloaded MultiGet API that improves performance by batching operations
455   // in the read path for greater efficiency. Currently, only the block based
456   // table format with full filters are supported. Other table formats such
457   // as plain table, block based table with block based filters and
458   // partitioned indexes will still work, but will not get any performance
459   // benefits.
460   // Parameters -
461   // options - ReadOptions
462   // column_family - ColumnFamilyHandle* that the keys belong to. All the keys
463   //                 passed to the API are restricted to a single column family
464   // num_keys - Number of keys to lookup
465   // keys - Pointer to C style array of key Slices with num_keys elements
466   // values - Pointer to C style array of PinnableSlices with num_keys elements
467   // statuses - Pointer to C style array of Status with num_keys elements
468   // sorted_input - If true, it means the input keys are already sorted by key
469   //                order, so the MultiGet() API doesn't have to sort them
470   //                again. If false, the keys will be copied and sorted
471   //                internally by the API - the input array will not be
472   //                modified
473   virtual void MultiGet(const ReadOptions& options,
474                         ColumnFamilyHandle* column_family,
475                         const size_t num_keys, const Slice* keys,
476                         PinnableSlice* values, Status* statuses,
477                         const bool /*sorted_input*/ = false) {
478     std::vector<ColumnFamilyHandle*> cf;
479     std::vector<Slice> user_keys;
480     std::vector<Status> status;
481     std::vector<std::string> vals;
482 
483     for (size_t i = 0; i < num_keys; ++i) {
484       cf.emplace_back(column_family);
485       user_keys.emplace_back(keys[i]);
486     }
487     status = MultiGet(options, cf, user_keys, &vals);
488     std::copy(status.begin(), status.end(), statuses);
489     for (auto& value : vals) {
490       values->PinSelf(value);
491       values++;
492     }
493   }
494 
495   // Overloaded MultiGet API that improves performance by batching operations
496   // in the read path for greater efficiency. Currently, only the block based
497   // table format with full filters are supported. Other table formats such
498   // as plain table, block based table with block based filters and
499   // partitioned indexes will still work, but will not get any performance
500   // benefits.
501   // Parameters -
502   // options - ReadOptions
503   // column_family - ColumnFamilyHandle* that the keys belong to. All the keys
504   //                 passed to the API are restricted to a single column family
505   // num_keys - Number of keys to lookup
506   // keys - Pointer to C style array of key Slices with num_keys elements
507   // values - Pointer to C style array of PinnableSlices with num_keys elements
508   // statuses - Pointer to C style array of Status with num_keys elements
509   // sorted_input - If true, it means the input keys are already sorted by key
510   //                order, so the MultiGet() API doesn't have to sort them
511   //                again. If false, the keys will be copied and sorted
512   //                internally by the API - the input array will not be
513   //                modified
514   virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
515                         ColumnFamilyHandle** column_families, const Slice* keys,
516                         PinnableSlice* values, Status* statuses,
517                         const bool /*sorted_input*/ = false) {
518     std::vector<ColumnFamilyHandle*> cf;
519     std::vector<Slice> user_keys;
520     std::vector<Status> status;
521     std::vector<std::string> vals;
522 
523     for (size_t i = 0; i < num_keys; ++i) {
524       cf.emplace_back(column_families[i]);
525       user_keys.emplace_back(keys[i]);
526     }
527     status = MultiGet(options, cf, user_keys, &vals);
528     std::copy(status.begin(), status.end(), statuses);
529     for (auto& value : vals) {
530       values->PinSelf(value);
531       values++;
532     }
533   }
534 
535   // If the key definitely does not exist in the database, then this method
536   // returns false, else true. If the caller wants to obtain value when the key
537   // is found in memory, a bool for 'value_found' must be passed. 'value_found'
538   // will be true on return if value has been set properly.
539   // This check is potentially lighter-weight than invoking DB::Get(). One way
540   // to make this lighter weight is to avoid doing any IOs.
541   // Default implementation here returns true and sets 'value_found' to false
542   virtual bool KeyMayExist(const ReadOptions& /*options*/,
543                            ColumnFamilyHandle* /*column_family*/,
544                            const Slice& /*key*/, std::string* /*value*/,
545                            bool* value_found = nullptr) {
546     if (value_found != nullptr) {
547       *value_found = false;
548     }
549     return true;
550   }
551   virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
552                            std::string* value, bool* value_found = nullptr) {
553     return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found);
554   }
555 
556   // Return a heap-allocated iterator over the contents of the database.
557   // The result of NewIterator() is initially invalid (caller must
558   // call one of the Seek methods on the iterator before using it).
559   //
560   // Caller should delete the iterator when it is no longer needed.
561   // The returned iterator should be deleted before this db is deleted.
562   virtual Iterator* NewIterator(const ReadOptions& options,
563                                 ColumnFamilyHandle* column_family) = 0;
NewIterator(const ReadOptions & options)564   virtual Iterator* NewIterator(const ReadOptions& options) {
565     return NewIterator(options, DefaultColumnFamily());
566   }
567   // Returns iterators from a consistent database state across multiple
568   // column families. Iterators are heap allocated and need to be deleted
569   // before the db is deleted
570   virtual Status NewIterators(
571       const ReadOptions& options,
572       const std::vector<ColumnFamilyHandle*>& column_families,
573       std::vector<Iterator*>* iterators) = 0;
574 
575   // Return a handle to the current DB state.  Iterators created with
576   // this handle will all observe a stable snapshot of the current DB
577   // state.  The caller must call ReleaseSnapshot(result) when the
578   // snapshot is no longer needed.
579   //
580   // nullptr will be returned if the DB fails to take a snapshot or does
581   // not support snapshot.
582   virtual const Snapshot* GetSnapshot() = 0;
583 
584   // Release a previously acquired snapshot.  The caller must not
585   // use "snapshot" after this call.
586   virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
587 
588 #ifndef ROCKSDB_LITE
  // Contains all valid property arguments for GetProperty().
  //
  // NOTE: Property names cannot end in numbers since those are interpreted as
  //       arguments, e.g., see kNumFilesAtLevelPrefix.
  struct Properties {
    //  "rocksdb.num-files-at-level<N>" - returns string containing the number
    //      of files at level <N>, where <N> is an ASCII representation of a
    //      level number (e.g., "0").
    static const std::string kNumFilesAtLevelPrefix;

    //  "rocksdb.compression-ratio-at-level<N>" - returns string containing the
    //      compression ratio of data at level <N>, where <N> is an ASCII
    //      representation of a level number (e.g., "0"). Here, compression
    //      ratio is defined as uncompressed data size / compressed file size.
    //      Returns "-1.0" if no open files at level <N>.
    static const std::string kCompressionRatioAtLevelPrefix;

    //  "rocksdb.stats" - returns a multi-line string containing the data
    //      described by kCFStats followed by the data described by kDBStats.
    static const std::string kStats;

    //  "rocksdb.sstables" - returns a multi-line string summarizing current
    //      SST files.
    static const std::string kSSTables;

    //  "rocksdb.cfstats" - Both of "rocksdb.cfstats-no-file-histogram" and
    //      "rocksdb.cf-file-histogram" together. See below for description
    //      of the two.
    static const std::string kCFStats;

    //  "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with
    //      general column family stats per-level over db's lifetime ("L<n>"),
    //      aggregated over db's lifetime ("Sum"), and aggregated over the
    //      interval since the last retrieval ("Int").
    //      It could also be used to return the stats in the format of the
    //      map. In this case there will be a pair of string to array of
    //      double for each level as well as for "Sum". "Int" stats will not
    //      be affected when this form of stats is retrieved.
    static const std::string kCFStatsNoFileHistogram;

    //  "rocksdb.cf-file-histogram" - print out how many file reads to every
    //      level, as well as the histogram of latency of single requests.
    static const std::string kCFFileHistogram;

    //  "rocksdb.dbstats" - returns a multi-line string with general database
    //      stats, both cumulative (over the db's lifetime) and interval (since
    //      the last retrieval of kDBStats).
    static const std::string kDBStats;

    //  "rocksdb.levelstats" - returns multi-line string containing the number
    //      of files per level and total size of each level (MB).
    static const std::string kLevelStats;

    //  "rocksdb.num-immutable-mem-table" - returns number of immutable
    //      memtables that have not yet been flushed.
    static const std::string kNumImmutableMemTable;

    //  "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable
    //      memtables that have already been flushed.
    static const std::string kNumImmutableMemTableFlushed;

    //  "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is
    //      pending; otherwise, returns 0.
    static const std::string kMemTableFlushPending;

    //  "rocksdb.num-running-flushes" - returns the number of currently running
    //      flushes.
    static const std::string kNumRunningFlushes;

    //  "rocksdb.compaction-pending" - returns 1 if at least one compaction is
    //      pending; otherwise, returns 0.
    static const std::string kCompactionPending;

    //  "rocksdb.num-running-compactions" - returns the number of currently
    //      running compactions.
    static const std::string kNumRunningCompactions;

    //  "rocksdb.background-errors" - returns accumulated number of background
    //      errors.
    static const std::string kBackgroundErrors;

    //  "rocksdb.cur-size-active-mem-table" - returns approximate size of active
    //      memtable (bytes).
    static const std::string kCurSizeActiveMemTable;

    //  "rocksdb.cur-size-all-mem-tables" - returns approximate size of active
    //      and unflushed immutable memtables (bytes).
    static const std::string kCurSizeAllMemTables;

    //  "rocksdb.size-all-mem-tables" - returns approximate size of active,
    //      unflushed immutable, and pinned immutable memtables (bytes).
    static const std::string kSizeAllMemTables;

    //  "rocksdb.num-entries-active-mem-table" - returns total number of entries
    //      in the active memtable.
    static const std::string kNumEntriesActiveMemTable;

    //  "rocksdb.num-entries-imm-mem-tables" - returns total number of entries
    //      in the unflushed immutable memtables.
    static const std::string kNumEntriesImmMemTables;

    //  "rocksdb.num-deletes-active-mem-table" - returns total number of delete
    //      entries in the active memtable.
    static const std::string kNumDeletesActiveMemTable;

    //  "rocksdb.num-deletes-imm-mem-tables" - returns total number of delete
    //      entries in the unflushed immutable memtables.
    static const std::string kNumDeletesImmMemTables;

    //  "rocksdb.estimate-num-keys" - returns estimated number of total keys in
    //      the active and unflushed immutable memtables and storage.
    static const std::string kEstimateNumKeys;

    //  "rocksdb.estimate-table-readers-mem" - returns estimated memory used for
    //      reading SST tables, excluding memory used in block cache (e.g.,
    //      filter and index blocks).
    static const std::string kEstimateTableReadersMem;

    //  "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete
    //      files is enabled; otherwise, returns a non-zero number.
    static const std::string kIsFileDeletionsEnabled;

    //  "rocksdb.num-snapshots" - returns number of unreleased snapshots of the
    //      database.
    static const std::string kNumSnapshots;

    //  "rocksdb.oldest-snapshot-time" - returns number representing unix
    //      timestamp of oldest unreleased snapshot.
    static const std::string kOldestSnapshotTime;

    //  "rocksdb.oldest-snapshot-sequence" - returns number representing
    //      sequence number of oldest unreleased snapshot.
    static const std::string kOldestSnapshotSequence;

    //  "rocksdb.num-live-versions" - returns number of live versions. `Version`
    //      is an internal data structure. See version_set.h for details. More
    //      live versions often mean more SST files are held from being deleted,
    //      by iterators or unfinished compactions.
    static const std::string kNumLiveVersions;

    //  "rocksdb.current-super-version-number" - returns number of current LSM
    //      version. It is a uint64_t integer number, incremented after there
    //      is any change to the LSM tree. The number is not preserved after
    //      restarting the DB. After DB restart, it will start from 0 again.
    static const std::string kCurrentSuperVersionNumber;

    //  "rocksdb.estimate-live-data-size" - returns an estimate of the amount of
    //      live data in bytes.
    static const std::string kEstimateLiveDataSize;

    //  "rocksdb.min-log-number-to-keep" - returns the minimum log number of
    //      the log files that should be kept.
    static const std::string kMinLogNumberToKeep;

    //  "rocksdb.min-obsolete-sst-number-to-keep" - returns the minimum file
    //      number for an obsolete SST to be kept. The max value of `uint64_t`
    //      will be returned if all obsolete files can be deleted.
    static const std::string kMinObsoleteSstNumberToKeep;

    //  "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST
    //      files.
    //  WARNING: may slow down online queries if there are too many files.
    static const std::string kTotalSstFilesSize;

    //  "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST
    //      files belonging to the latest LSM tree.
    static const std::string kLiveSstFilesSize;

    //  "rocksdb.base-level" - returns number of level to which L0 data will be
    //      compacted.
    static const std::string kBaseLevel;

    //  "rocksdb.estimate-pending-compaction-bytes" - returns estimated total
    //      number of bytes compaction needs to rewrite to get all levels down
    //      to under target size. Not valid for other compactions than level-
    //      based.
    static const std::string kEstimatePendingCompactionBytes;

    //  "rocksdb.aggregated-table-properties" - returns a string representation
    //      of the aggregated table properties of the target column family.
    static const std::string kAggregatedTableProperties;

    //  "rocksdb.aggregated-table-properties-at-level<N>", same as the previous
    //      one but only returns the aggregated table properties of the
    //      specified level "N" at the target column family.
    static const std::string kAggregatedTablePropertiesAtLevel;

    //  "rocksdb.actual-delayed-write-rate" - returns the current actual delayed
    //      write rate. 0 means no delay.
    static const std::string kActualDelayedWriteRate;

    //  "rocksdb.is-write-stopped" - returns 1 if write has been stopped.
    static const std::string kIsWriteStopped;

    //  "rocksdb.estimate-oldest-key-time" - returns an estimation of
    //      oldest key timestamp in the DB. Currently only available for
    //      FIFO compaction with
    //      compaction_options_fifo.allow_compaction = false.
    static const std::string kEstimateOldestKeyTime;

    //  "rocksdb.block-cache-capacity" - returns block cache capacity.
    static const std::string kBlockCacheCapacity;

    //  "rocksdb.block-cache-usage" - returns the memory size for the entries
    //      residing in block cache.
    static const std::string kBlockCacheUsage;

    //  "rocksdb.block-cache-pinned-usage" - returns the memory size for the
    //      entries being pinned.
    static const std::string kBlockCachePinnedUsage;

    //  "rocksdb.options-statistics" - returns multi-line string
    //      of options.statistics
    static const std::string kOptionsStatistics;
  };
804 #endif /* ROCKSDB_LITE */
805 
  // DB implementations can export properties about their state via this method.
  // If "property" is a valid property understood by this DB implementation (see
  // Properties struct above for valid options), fills "*value" with its current
  // value and returns true.  Otherwise, returns false.
  // "column_family" is the column family the property is requested for.
  virtual bool GetProperty(ColumnFamilyHandle* column_family,
                           const Slice& property, std::string* value) = 0;
GetProperty(const Slice & property,std::string * value)812   virtual bool GetProperty(const Slice& property, std::string* value) {
813     return GetProperty(DefaultColumnFamily(), property, value);
814   }
  // Map-valued counterpart of GetProperty(): the result for "property" is
  // written to "*value" as string key/value pairs.
  // NOTE(review): presumably returns true on success and false for an
  // unknown/unsupported property, mirroring GetProperty() -- confirm.
  virtual bool GetMapProperty(ColumnFamilyHandle* column_family,
                              const Slice& property,
                              std::map<std::string, std::string>* value) = 0;
GetMapProperty(const Slice & property,std::map<std::string,std::string> * value)818   virtual bool GetMapProperty(const Slice& property,
819                               std::map<std::string, std::string>* value) {
820     return GetMapProperty(DefaultColumnFamily(), property, value);
821   }
822 
  // Similar to GetProperty(), but only works for a subset of properties whose
  // return value is an integer. The value is returned as an integer in
  // "*value". Supported properties:
  //  "rocksdb.num-immutable-mem-table"
  //  "rocksdb.mem-table-flush-pending"
  //  "rocksdb.compaction-pending"
  //  "rocksdb.background-errors"
  //  "rocksdb.cur-size-active-mem-table"
  //  "rocksdb.cur-size-all-mem-tables"
  //  "rocksdb.size-all-mem-tables"
  //  "rocksdb.num-entries-active-mem-table"
  //  "rocksdb.num-entries-imm-mem-tables"
  //  "rocksdb.num-deletes-active-mem-table"
  //  "rocksdb.num-deletes-imm-mem-tables"
  //  "rocksdb.estimate-num-keys"
  //  "rocksdb.estimate-table-readers-mem"
  //  "rocksdb.is-file-deletions-enabled"
  //  "rocksdb.num-snapshots"
  //  "rocksdb.oldest-snapshot-time"
  //  "rocksdb.num-live-versions"
  //  "rocksdb.current-super-version-number"
  //  "rocksdb.estimate-live-data-size"
  //  "rocksdb.min-log-number-to-keep"
  //  "rocksdb.min-obsolete-sst-number-to-keep"
  //  "rocksdb.total-sst-files-size"
  //  "rocksdb.live-sst-files-size"
  //  "rocksdb.base-level"
  //  "rocksdb.estimate-pending-compaction-bytes"
  //  "rocksdb.num-running-compactions"
  //  "rocksdb.num-running-flushes"
  //  "rocksdb.actual-delayed-write-rate"
  //  "rocksdb.is-write-stopped"
  //  "rocksdb.estimate-oldest-key-time"
  //  "rocksdb.block-cache-capacity"
  //  "rocksdb.block-cache-usage"
  //  "rocksdb.block-cache-pinned-usage"
  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
                              const Slice& property, uint64_t* value) = 0;
GetIntProperty(const Slice & property,uint64_t * value)861   virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
862     return GetIntProperty(DefaultColumnFamily(), property, value);
863   }
864 
865   // Reset internal stats for DB and all column families.
866   // Note this doesn't reset options.statistics as it is not owned by
867   // DB.
ResetStats()868   virtual Status ResetStats() {
869     return Status::NotSupported("Not implemented");
870   }
871 
  // Same as GetIntProperty(), but returns the int property aggregated
  // across all column families.
  virtual bool GetAggregatedIntProperty(const Slice& property,
                                        uint64_t* value) = 0;
876 
877   // Flags for DB::GetSizeApproximation that specify whether memtable
878   // stats should be included, or file stats approximation or both
879   enum SizeApproximationFlags : uint8_t {
880     NONE = 0,
881     INCLUDE_MEMTABLES = 1 << 0,
882     INCLUDE_FILES = 1 << 1
883   };
884 
  // For each i in [0,n-1], store in "sizes[i]", the approximate
  // file system space used by keys in "[range[i].start .. range[i].limit)".
  //
  // Note that the returned sizes measure file system space usage, so
  // if the user data compresses by a factor of ten, the returned
  // sizes will be one-tenth the size of the corresponding user data size.
  //
  // "options" controls what is included in the approximation and
  // "column_family" is the column family that is inspected.
  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
                                     ColumnFamilyHandle* column_family,
                                     const Range* range, int n,
                                     uint64_t* sizes) = 0;
895 
896   // Simpler versions of the GetApproximateSizes() method above.
897   // The include_flags argumenbt must of type DB::SizeApproximationFlags
898   // and can not be NONE.
899   virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
900                                    const Range* range, int n, uint64_t* sizes,
901                                    uint8_t include_flags = INCLUDE_FILES) {
902     SizeApproximationOptions options;
903     options.include_memtabtles =
904         (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0;
905     options.include_files =
906         (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0;
907     GetApproximateSizes(options, column_family, range, n, sizes);
908   }
909   virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes,
910                                    uint8_t include_flags = INCLUDE_FILES) {
911     GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags);
912   }
913 
  // The method is similar to GetApproximateSizes, except that it
  // returns the approximate number of records in memtables.
  // NOTE(review): presumably "*count" receives the number of records and
  // "*size" their approximate size for keys in "range" -- confirm.
  virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
                                           const Range& range,
                                           uint64_t* const count,
                                           uint64_t* const size) = 0;
GetApproximateMemTableStats(const Range & range,uint64_t * const count,uint64_t * const size)920   virtual void GetApproximateMemTableStats(const Range& range,
921                                            uint64_t* const count,
922                                            uint64_t* const size) {
923     GetApproximateMemTableStats(DefaultColumnFamily(), range, count, size);
924   }
925 
926   // Deprecated versions of GetApproximateSizes
GetApproximateSizes(const Range * range,int n,uint64_t * sizes,bool include_memtable)927   ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes(
928       const Range* range, int n, uint64_t* sizes, bool include_memtable) {
929     uint8_t include_flags = SizeApproximationFlags::INCLUDE_FILES;
930     if (include_memtable) {
931       include_flags |= SizeApproximationFlags::INCLUDE_MEMTABLES;
932     }
933     GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags);
934   }
GetApproximateSizes(ColumnFamilyHandle * column_family,const Range * range,int n,uint64_t * sizes,bool include_memtable)935   ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes(
936       ColumnFamilyHandle* column_family, const Range* range, int n,
937       uint64_t* sizes, bool include_memtable) {
938     uint8_t include_flags = SizeApproximationFlags::INCLUDE_FILES;
939     if (include_memtable) {
940       include_flags |= SizeApproximationFlags::INCLUDE_MEMTABLES;
941     }
942     GetApproximateSizes(column_family, range, n, sizes, include_flags);
943   }
944 
  // Compact the underlying storage for the key range [*begin,*end].
  // The actual compaction interval might be a superset of [*begin, *end].
  // In particular, deleted and overwritten versions are discarded,
  // and the data is rearranged to reduce the cost of operations
  // needed to access the data.  This operation should typically only
  // be invoked by users who understand the underlying implementation.
  //
  // begin==nullptr is treated as a key before all keys in the database.
  // end==nullptr is treated as a key after all keys in the database.
  // Therefore the following call will compact the entire database:
  //    db->CompactRange(options, nullptr, nullptr);
  // Note that after the entire database is compacted, all data are pushed
  // down to the last level containing any data. If the total data size after
  // compaction is reduced, that level might not be appropriate for hosting all
  // the files. In this case, client could set options.change_level to true, to
  // move the files back to the minimum level capable of holding the data set
  // or a given level (specified by non-negative options.target_level).
  virtual Status CompactRange(const CompactRangeOptions& options,
                              ColumnFamilyHandle* column_family,
                              const Slice* begin, const Slice* end) = 0;
CompactRange(const CompactRangeOptions & options,const Slice * begin,const Slice * end)965   virtual Status CompactRange(const CompactRangeOptions& options,
966                               const Slice* begin, const Slice* end) {
967     return CompactRange(options, DefaultColumnFamily(), begin, end);
968   }
969 
970   ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange(
971       ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end,
972       bool change_level = false, int target_level = -1,
973       uint32_t target_path_id = 0) {
974     CompactRangeOptions options;
975     options.change_level = change_level;
976     options.target_level = target_level;
977     options.target_path_id = target_path_id;
978     return CompactRange(options, column_family, begin, end);
979   }
980 
981   ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange(
982       const Slice* begin, const Slice* end, bool change_level = false,
983       int target_level = -1, uint32_t target_path_id = 0) {
984     CompactRangeOptions options;
985     options.change_level = change_level;
986     options.target_level = target_level;
987     options.target_path_id = target_path_id;
988     return CompactRange(options, DefaultColumnFamily(), begin, end);
989   }
990 
SetOptions(ColumnFamilyHandle *,const std::unordered_map<std::string,std::string> &)991   virtual Status SetOptions(
992       ColumnFamilyHandle* /*column_family*/,
993       const std::unordered_map<std::string, std::string>& /*new_options*/) {
994     return Status::NotSupported("Not implemented");
995   }
SetOptions(const std::unordered_map<std::string,std::string> & new_options)996   virtual Status SetOptions(
997       const std::unordered_map<std::string, std::string>& new_options) {
998     return SetOptions(DefaultColumnFamily(), new_options);
999   }
1000 
  // NOTE(review): undocumented here; presumably the DB-wide counterpart of
  // SetOptions(), taking option name -> value strings in "new_options" --
  // confirm.
  virtual Status SetDBOptions(
      const std::unordered_map<std::string, std::string>& new_options) = 0;
1003 
  // CompactFiles() inputs a list of files specified by file numbers and
  // compacts them to the specified level. Note that the behavior is different
  // from CompactRange() in that CompactFiles() performs the compaction job
  // using the CURRENT thread.
  //
  // "output_path_id", "output_file_names" and "compaction_job_info" are
  // optional and default to -1, nullptr and nullptr respectively.
  //
  // @see GetDataBaseMetaData
  // @see GetColumnFamilyMetaData
  virtual Status CompactFiles(
      const CompactionOptions& compact_options,
      ColumnFamilyHandle* column_family,
      const std::vector<std::string>& input_file_names, const int output_level,
      const int output_path_id = -1,
      std::vector<std::string>* const output_file_names = nullptr,
      CompactionJobInfo* compaction_job_info = nullptr) = 0;
1018 
1019   virtual Status CompactFiles(
1020       const CompactionOptions& compact_options,
1021       const std::vector<std::string>& input_file_names, const int output_level,
1022       const int output_path_id = -1,
1023       std::vector<std::string>* const output_file_names = nullptr,
1024       CompactionJobInfo* compaction_job_info = nullptr) {
1025     return CompactFiles(compact_options, DefaultColumnFamily(),
1026                         input_file_names, output_level, output_path_id,
1027                         output_file_names, compaction_job_info);
1028   }
1029 
  // PauseBackgroundWork() will wait until all currently running background
  // processes finish. After it returns, no background process will be run
  // until ContinueBackgroundWork() is called.
  virtual Status PauseBackgroundWork() = 0;
  virtual Status ContinueBackgroundWork() = 0;
1035 
  // This function will enable automatic compactions for the given column
  // families if they were previously disabled. The function will first set the
  // disable_auto_compactions option for each column family to 'false', after
  // which it will schedule a flush/compaction.
  //
  // NOTE: Setting disable_auto_compactions to 'false' through SetOptions() API
  // does NOT schedule a flush/compaction afterwards, and only changes the
  // parameter itself within the column family option.
  virtual Status EnableAutoCompaction(
      const std::vector<ColumnFamilyHandle*>& column_family_handles) = 0;
1047 
  // NOTE(review): undocumented here; presumably these two calls turn manual
  // compactions off and back on again -- confirm exact semantics (e.g. what
  // happens to a manual compaction that is already running).
  virtual void DisableManualCompaction() = 0;
  virtual void EnableManualCompaction() = 0;
1050 
  // Number of levels used for the given column family of this DB.
  virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
NumberLevels()1053   virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
1054 
  // Maximum level to which a new compacted memtable is pushed if it
  // does not create overlap, for the given column family.
  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
MaxMemCompactionLevel()1058   virtual int MaxMemCompactionLevel() {
1059     return MaxMemCompactionLevel(DefaultColumnFamily());
1060   }
1061 
  // Number of files in level-0 that would stop writes, for the given column
  // family.
  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
Level0StopWriteTrigger()1064   virtual int Level0StopWriteTrigger() {
1065     return Level0StopWriteTrigger(DefaultColumnFamily());
1066   }
1067 
  // Get the DB name -- the exact same name that was provided as an argument
  // to DB::Open().
  virtual const std::string& GetName() const = 0;
1071 
  // Get the Env object from the DB.
  virtual Env* GetEnv() const = 0;
1074 
  // Get the FileSystem object from the DB. Unlike its neighbours this is not
  // pure virtual; it is only declared here, so its default implementation is
  // defined elsewhere.
  virtual FileSystem* GetFileSystem() const;
1076 
  // Get the Options that we use for the given column family.  During the
  // process of opening the column family, the options provided when calling
  // DB::Open() or DB::CreateColumnFamily() will have been "sanitized" and
  // transformed in an implementation-defined manner.
  virtual Options GetOptions(ColumnFamilyHandle* column_family) const = 0;
GetOptions()1082   virtual Options GetOptions() const {
1083     return GetOptions(DefaultColumnFamily());
1084   }
1085 
  // Get the DB-wide options (DBOptions) of this DB.
  // NOTE(review): presumably sanitized in the same way as described for
  // GetOptions() -- confirm.
  virtual DBOptions GetDBOptions() const = 0;
1087 
  // Flush all mem-table data of a single column family.
  // Only that column family is flushed, even when atomic flush is enabled.
  // To flush multiple column families, use Flush(options, column_families).
  virtual Status Flush(const FlushOptions& options,
                       ColumnFamilyHandle* column_family) = 0;
Flush(const FlushOptions & options)1093   virtual Status Flush(const FlushOptions& options) {
1094     return Flush(options, DefaultColumnFamily());
1095   }
  // Flushes multiple column families.
  // If atomic flush is not enabled, Flush(options, column_families) is
  // equivalent to calling Flush(options, column_family) multiple times.
  // If atomic flush is enabled, Flush(options, column_families) will flush all
  // column families specified in 'column_families' up to the latest sequence
  // number at the time when flush is requested.
  // Note that RocksDB 5.15 and earlier may not be able to open later versions
  // with atomic flush enabled.
  virtual Status Flush(
      const FlushOptions& options,
      const std::vector<ColumnFamilyHandle*>& column_families) = 0;
1107 
1108   // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL
1109   // afterwards.
FlushWAL(bool)1110   virtual Status FlushWAL(bool /*sync*/) {
1111     return Status::NotSupported("FlushWAL not implemented");
1112   }
  // Sync the WAL. Note that Write() followed by SyncWAL() is not exactly the
  // same as Write() with sync=true: in the latter case the changes won't be
  // visible until the sync is done.
  // Currently only works if allow_mmap_writes = false in Options.
  virtual Status SyncWAL() = 0;
1118 
1119   // Lock the WAL. Also flushes the WAL after locking.
LockWAL()1120   virtual Status LockWAL() {
1121     return Status::NotSupported("LockWAL not implemented");
1122   }
1123 
1124   // Unlock the WAL.
UnlockWAL()1125   virtual Status UnlockWAL() {
1126     return Status::NotSupported("UnlockWAL not implemented");
1127   }
1128 
  // Returns the sequence number of the most recent transaction.
  virtual SequenceNumber GetLatestSequenceNumber() const = 0;
1131 
  // Instructs DB to preserve deletes with sequence numbers >= passed seqnum.
  // Has no effect if DBOptions.preserve_deletes is set to false.
  // This function assumes that user calls this function with monotonically
  // increasing seqnums (otherwise we can't guarantee that a particular delete
  // hasn't been already processed); returns true if the value was successfully
  // updated, false if user attempted to call it with seqnum <= current value.
  virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) = 0;
1139 
1140 #ifndef ROCKSDB_LITE
1141 
  // Prevent file deletions. Compactions will continue to occur,
  // but no obsolete files will be deleted. Calling this multiple
  // times has the same effect as calling it once.
  virtual Status DisableFileDeletions() = 0;
1146 
  // Allow compactions to delete obsolete files.
  // If force == true, the call to EnableFileDeletions() will guarantee that
  // file deletions are enabled after the call, even if DisableFileDeletions()
  // was called multiple times before.
  // If force == false, EnableFileDeletions() will only enable file deletion
  // after it's been called at least as many times as DisableFileDeletions(),
  // enabling the two methods to be called by two threads concurrently without
  // synchronization -- i.e., file deletions will be enabled only after both
  // threads call EnableFileDeletions().
  virtual Status EnableFileDeletions(bool force = true) = 0;
1157 
  // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup.

  // Retrieve the list of all files in the database. The files are
  // relative to the dbname and are not absolute paths. Despite being relative
  // paths, the file names begin with "/". The valid size of the manifest file
  // is returned in manifest_file_size. The manifest file is an ever growing
  // file, but only the portion specified by manifest_file_size is valid for
  // this snapshot. Setting flush_memtable to true does Flush before recording
  // the live files. Setting flush_memtable to false is useful when we don't
  // want to wait for flush which may have to wait for compaction to complete
  // taking an indeterminate time.
  //
  // In case you have multiple column families, even if flush_memtable is true,
  // you still need to call GetSortedWalFiles after GetLiveFiles to compensate
  // for new data that arrived to already-flushed column families while other
  // column families were flushing.
  virtual Status GetLiveFiles(std::vector<std::string>&,
                              uint64_t* manifest_file_size,
                              bool flush_memtable = true) = 0;
1177 
  // Retrieve the sorted list of all WAL files, with the earliest file first.
  virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
1180 
  // Retrieve information about the current WAL file.
  //
  // Note that the log might have rolled after this call, in which case
  // the current_log_file would not point to the current log file.
  //
  // Additionally, for the sake of optimization,
  // current_log_file->StartSequence would always be set to 0.
  virtual Status GetCurrentWalFile(
      std::unique_ptr<LogFile>* current_log_file) = 0;
1190 
  // Retrieves the creation time of the oldest file in the DB.
  // This API only works if max_open_files = -1; if it is not, then the
  // Status returned is Status::NotSupported().
  // The file creation time is set using the env provided to the DB.
  // If the DB was created from a very old release then it's possible that
  // the SST files might not have the file_creation_time property, and even
  // after moving to a newer release it's possible that some files never got
  // compacted and may not have the file_creation_time property. In both
  // cases file_creation_time is considered 0, which means this API will return
  // creation_time = 0 as there wouldn't be a timestamp lower than 0.
  virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0;
1202 
  // Note: this API is not yet consistent with WritePrepared transactions.
  // Sets iter to an iterator that is positioned at a write-batch containing
  // seq_number. If the sequence number is non-existent, it returns an iterator
  // at the first available seq_no after the requested seq_no.
  // Returns Status::OK if the iterator is valid.
  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
  // use this API, else the WAL files will get
  // cleared aggressively and the iterator might keep getting invalid before
  // an update is read.
  virtual Status GetUpdatesSince(
      SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
      const TransactionLogIterator::ReadOptions& read_options =
          TransactionLogIterator::ReadOptions()) = 0;
1216 
1217 // Windows API macro interference
1218 #undef DeleteFile
  // Deletes the file 'name' from the db directory and updates the internal
  // state to reflect that. Supports deletion of sst and log files only.
  // 'name' must be a path relative to the db directory, e.g. 000001.sst,
  // /archive/000003.log
  virtual Status DeleteFile(std::string name) = 0;
1223 
  // Returns a list of all table files with their level, start key
  // and end key.
  // The default implementation provided here has an empty body: it ignores
  // its argument and does not modify the output vector.
  virtual void GetLiveFilesMetaData(
      std::vector<LiveFileMetaData>* /*metadata*/) {}
1228 
  // Obtains the meta data of the specified column family of the DB.
  // The default implementation provided here has an empty body: both
  // arguments are ignored and the output object is not modified.
  virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
                                       ColumnFamilyMetaData* /*metadata*/) {}
1232 
1233   // Get the metadata of the default column family.
GetColumnFamilyMetaData(ColumnFamilyMetaData * metadata)1234   void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) {
1235     GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
1236   }
1237 
  // IngestExternalFile() will load a list of external SST files (1) into the
  // DB. Two primary modes are supported:
  // - Duplicate keys in the new files will overwrite existing keys (default)
  // - Duplicate keys will be skipped (set ingest_behind=true)
  // In the first mode we will try to find the lowest possible level that
  // the file can fit in, and ingest the file into this level (2). A file
  // whose key range overlaps with the memtable key range will require us
  // to Flush the memtable first before ingesting the file.
  // In the second mode we will always ingest in the bottom most level (see
  // docs to IngestExternalFileOptions::ingest_behind).
  //
  // (1) External SST files can be created using SstFileWriter
  // (2) We will try to ingest the files to the lowest possible level
  //     even if the file compression doesn't match the level compression
  // (3) If IngestExternalFileOptions->ingest_behind is set to true,
  //     we always ingest at the bottommost level, which should be reserved
  //     for this purpose (see DBOptions::allow_ingest_behind flag).
  virtual Status IngestExternalFile(
      ColumnFamilyHandle* column_family,
      const std::vector<std::string>& external_files,
      const IngestExternalFileOptions& options) = 0;
1259 
IngestExternalFile(const std::vector<std::string> & external_files,const IngestExternalFileOptions & options)1260   virtual Status IngestExternalFile(
1261       const std::vector<std::string>& external_files,
1262       const IngestExternalFileOptions& options) {
1263     return IngestExternalFile(DefaultColumnFamily(), external_files, options);
1264   }
1265 
  // IngestExternalFiles() will ingest files for multiple column families, and
  // record the result atomically to the MANIFEST.
  // If this function returns OK, all column families' ingestion must succeed.
  // If this function returns a non-OK status, or the process crashes, then
  // none of the files will be ingested into the database after recovery.
  // Note that it is possible for the application to observe a mixed state
  // during the execution of this function. If the user performs a range scan
  // over the column families with iterators, an iterator on one column family
  // may return ingested data, while an iterator on another column family
  // returns old data.
  // Users can use a snapshot for a consistent view of data.
  // If your db ingests multiple SST files using this API, i.e. args.size()
  // > 1, then RocksDB 5.15 and earlier will not be able to open it.
  //
  // REQUIRES: each arg corresponds to a different column family: namely, for
  // 0 <= i < j < len(args), args[i].column_family != args[j].column_family.
  virtual Status IngestExternalFiles(
      const std::vector<IngestExternalFileArg>& args) = 0;
1283 
  // CreateColumnFamilyWithImport() will create a new column family with
  // column_family_name and import the external SST files specified in
  // metadata into this column family.
  // (1) External SST files can be created using SstFileWriter.
  // (2) External SST files can be exported from a particular column family in
  //     an existing DB.
  // The option in import_options specifies whether the external files are
  // copied or moved (default is copy). When the option specifies copy,
  // managing the files at external_file_path is the caller's responsibility.
  // When the option specifies a move, the call ensures that the specified
  // files at external_file_path are deleted on successful return and that the
  // files are not modified on any error return.
  // On error return, the column family handle returned will be nullptr.
  // The ColumnFamily will be present on successful return and will not be
  // present on error return. The ColumnFamily may be present after any crash
  // during this call.
  virtual Status CreateColumnFamilyWithImport(
      const ColumnFamilyOptions& options, const std::string& column_family_name,
      const ImportColumnFamilyOptions& import_options,
      const ExportImportFilesMetaData& metadata,
      ColumnFamilyHandle** handle) = 0;
1304 
  // NOTE(review): presumably verifies the checksums of the DB's data using
  // the given read_options; the exact scope is not documented here -- confirm
  // against the implementation.
  virtual Status VerifyChecksum(const ReadOptions& read_options) = 0;
1306 
VerifyChecksum()1307   virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
1308 
1309   // AddFile() is deprecated, please use IngestExternalFile()
1310   ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1311       ColumnFamilyHandle* column_family,
1312       const std::vector<std::string>& file_path_list, bool move_file = false,
1313       bool skip_snapshot_check = false) {
1314     IngestExternalFileOptions ifo;
1315     ifo.move_files = move_file;
1316     ifo.snapshot_consistency = !skip_snapshot_check;
1317     ifo.allow_global_seqno = false;
1318     ifo.allow_blocking_flush = false;
1319     return IngestExternalFile(column_family, file_path_list, ifo);
1320   }
1321 
1322   ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1323       const std::vector<std::string>& file_path_list, bool move_file = false,
1324       bool skip_snapshot_check = false) {
1325     IngestExternalFileOptions ifo;
1326     ifo.move_files = move_file;
1327     ifo.snapshot_consistency = !skip_snapshot_check;
1328     ifo.allow_global_seqno = false;
1329     ifo.allow_blocking_flush = false;
1330     return IngestExternalFile(DefaultColumnFamily(), file_path_list, ifo);
1331   }
1332 
1333   // AddFile() is deprecated, please use IngestExternalFile()
1334   ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1335       ColumnFamilyHandle* column_family, const std::string& file_path,
1336       bool move_file = false, bool skip_snapshot_check = false) {
1337     IngestExternalFileOptions ifo;
1338     ifo.move_files = move_file;
1339     ifo.snapshot_consistency = !skip_snapshot_check;
1340     ifo.allow_global_seqno = false;
1341     ifo.allow_blocking_flush = false;
1342     return IngestExternalFile(column_family, {file_path}, ifo);
1343   }
1344 
1345   ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1346       const std::string& file_path, bool move_file = false,
1347       bool skip_snapshot_check = false) {
1348     IngestExternalFileOptions ifo;
1349     ifo.move_files = move_file;
1350     ifo.snapshot_consistency = !skip_snapshot_check;
1351     ifo.allow_global_seqno = false;
1352     ifo.allow_blocking_flush = false;
1353     return IngestExternalFile(DefaultColumnFamily(), {file_path}, ifo);
1354   }
1355 
1356   // Load table file with information "file_info" into "column_family"
1357   ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1358       ColumnFamilyHandle* column_family,
1359       const std::vector<ExternalSstFileInfo>& file_info_list,
1360       bool move_file = false, bool skip_snapshot_check = false) {
1361     std::vector<std::string> external_files;
1362     for (const ExternalSstFileInfo& file_info : file_info_list) {
1363       external_files.push_back(file_info.file_path);
1364     }
1365     IngestExternalFileOptions ifo;
1366     ifo.move_files = move_file;
1367     ifo.snapshot_consistency = !skip_snapshot_check;
1368     ifo.allow_global_seqno = false;
1369     ifo.allow_blocking_flush = false;
1370     return IngestExternalFile(column_family, external_files, ifo);
1371   }
1372 
1373   ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1374       const std::vector<ExternalSstFileInfo>& file_info_list,
1375       bool move_file = false, bool skip_snapshot_check = false) {
1376     std::vector<std::string> external_files;
1377     for (const ExternalSstFileInfo& file_info : file_info_list) {
1378       external_files.push_back(file_info.file_path);
1379     }
1380     IngestExternalFileOptions ifo;
1381     ifo.move_files = move_file;
1382     ifo.snapshot_consistency = !skip_snapshot_check;
1383     ifo.allow_global_seqno = false;
1384     ifo.allow_blocking_flush = false;
1385     return IngestExternalFile(DefaultColumnFamily(), external_files, ifo);
1386   }
1387 
1388   ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1389       ColumnFamilyHandle* column_family, const ExternalSstFileInfo* file_info,
1390       bool move_file = false, bool skip_snapshot_check = false) {
1391     IngestExternalFileOptions ifo;
1392     ifo.move_files = move_file;
1393     ifo.snapshot_consistency = !skip_snapshot_check;
1394     ifo.allow_global_seqno = false;
1395     ifo.allow_blocking_flush = false;
1396     return IngestExternalFile(column_family, {file_info->file_path}, ifo);
1397   }
1398 
1399   ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
1400       const ExternalSstFileInfo* file_info, bool move_file = false,
1401       bool skip_snapshot_check = false) {
1402     IngestExternalFileOptions ifo;
1403     ifo.move_files = move_file;
1404     ifo.snapshot_consistency = !skip_snapshot_check;
1405     ifo.allow_global_seqno = false;
1406     ifo.allow_blocking_flush = false;
1407     return IngestExternalFile(DefaultColumnFamily(), {file_info->file_path},
1408                               ifo);
1409   }
1410 
1411 #endif  // ROCKSDB_LITE
1412 
  // Stores in 'identity' the unique ID which is read from the IDENTITY file
  // during the opening of the database.
  // Returns Status::OK if identity could be set properly.
  virtual Status GetDbIdentity(std::string& identity) const = 0;
1417 
  // Returns the handle of the default column family.
  virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
1420 
1421 #ifndef ROCKSDB_LITE
  // NOTE(review): presumably fills 'props' with the table properties of every
  // table file belonging to 'column_family' -- confirm against the
  // implementation.
  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
                                          TablePropertiesCollection* props) = 0;
GetPropertiesOfAllTables(TablePropertiesCollection * props)1424   virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
1425     return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
1426   }
  // NOTE(review): presumably 'range' points to an array of 'n' ranges and
  // 'props' receives the properties of the tables of 'column_family' that
  // fall within them -- confirm against the implementation.
  virtual Status GetPropertiesOfTablesInRange(
      ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
      TablePropertiesCollection* props) = 0;
1430 
SuggestCompactRange(ColumnFamilyHandle *,const Slice *,const Slice *)1431   virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/,
1432                                      const Slice* /*begin*/,
1433                                      const Slice* /*end*/) {
1434     return Status::NotSupported("SuggestCompactRange() is not implemented.");
1435   }
1436 
PromoteL0(ColumnFamilyHandle *,int)1437   virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/,
1438                            int /*target_level*/) {
1439     return Status::NotSupported("PromoteL0() is not implemented.");
1440   }
1441 
1442   // Trace DB operations. Use EndTrace() to stop tracing.
StartTrace(const TraceOptions &,std::unique_ptr<TraceWriter> &&)1443   virtual Status StartTrace(const TraceOptions& /*options*/,
1444                             std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
1445     return Status::NotSupported("StartTrace() is not implemented.");
1446   }
1447 
EndTrace()1448   virtual Status EndTrace() {
1449     return Status::NotSupported("EndTrace() is not implemented.");
1450   }
1451 
1452   // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing.
StartBlockCacheTrace(const TraceOptions &,std::unique_ptr<TraceWriter> &&)1453   virtual Status StartBlockCacheTrace(
1454       const TraceOptions& /*options*/,
1455       std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
1456     return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
1457   }
1458 
EndBlockCacheTrace()1459   virtual Status EndBlockCacheTrace() {
1460     return Status::NotSupported("EndBlockCacheTrace() is not implemented.");
1461   }
1462 #endif  // ROCKSDB_LITE
1463 
1464   // Needed for StackableDB
GetRootDB()1465   virtual DB* GetRootDB() { return this; }
1466 
1467   // Given a window [start_time, end_time), setup a StatsHistoryIterator
1468   // to access stats history. Note the start_time and end_time are epoch
1469   // time measured in seconds, and end_time is an exclusive bound.
GetStatsHistory(uint64_t,uint64_t,std::unique_ptr<StatsHistoryIterator> *)1470   virtual Status GetStatsHistory(
1471       uint64_t /*start_time*/, uint64_t /*end_time*/,
1472       std::unique_ptr<StatsHistoryIterator>* /*stats_iterator*/) {
1473     return Status::NotSupported("GetStatsHistory() is not implemented.");
1474   }
1475 
1476 #ifndef ROCKSDB_LITE
1477   // Make the secondary instance catch up with the primary by tailing and
1478   // replaying the MANIFEST and WAL of the primary.
1479   // Column families created by the primary after the secondary instance starts
1480   // will be ignored unless the secondary instance closes and restarts with the
1481   // newly created column families.
1482   // Column families that exist before secondary instance starts and dropped by
1483   // the primary afterwards will be marked as dropped. However, as long as the
1484   // secondary instance does not delete the corresponding column family
1485   // handles, the data of the column family is still accessible to the
1486   // secondary.
1487   // TODO: we will support WAL tailing soon.
TryCatchUpWithPrimary()1488   virtual Status TryCatchUpWithPrimary() {
1489     return Status::NotSupported("Supported only by secondary instance");
1490   }
1491 #endif  // !ROCKSDB_LITE
1492 };
1493 
// Destroy the contents of the specified database.
// Be very careful using this method.
// If column_families is not supplied, it defaults to an empty vector.
Status DestroyDB(const std::string& name, const Options& options,
                 const std::vector<ColumnFamilyDescriptor>& column_families =
                     std::vector<ColumnFamilyDescriptor>());
1499 
1500 #ifndef ROCKSDB_LITE
// If a DB cannot be opened, you may attempt to call this method to
// resurrect as much of the contents of the database as possible.
// Some data may be lost, so be careful when calling this function
// on a database that contains important information.
//
// With this API, we will warn about and skip data associated with column
// families not specified in column_families.
//
// @param column_families Descriptors for known column families
Status RepairDB(const std::string& dbname, const DBOptions& db_options,
                const std::vector<ColumnFamilyDescriptor>& column_families);
1512 
// Overload of RepairDB() that additionally takes options for unknown column
// families.
//
// @param unknown_cf_opts Options for column families encountered during the
//                        repair that were not specified in column_families.
Status RepairDB(const std::string& dbname, const DBOptions& db_options,
                const std::vector<ColumnFamilyDescriptor>& column_families,
                const ColumnFamilyOptions& unknown_cf_opts);
1518 
// Overload of RepairDB() that takes a single Options object.
//
// @param options These options will be used for the database and for ALL
//                column families encountered during the repair.
Status RepairDB(const std::string& dbname, const Options& options);
1522 
1523 #endif
1524 
1525 }  // namespace ROCKSDB_NAMESPACE
1526