1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 2 // This source code is licensed under both the GPLv2 (found in the 3 // COPYING file in the root directory) and Apache 2.0 License 4 // (found in the LICENSE.Apache file in the root directory). 5 // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE file. See the AUTHORS file for names of contributors. 8 9 #pragma once 10 11 #include <stdint.h> 12 #include <stdio.h> 13 #include <map> 14 #include <memory> 15 #include <string> 16 #include <unordered_map> 17 #include <vector> 18 #include "rocksdb/iterator.h" 19 #include "rocksdb/listener.h" 20 #include "rocksdb/metadata.h" 21 #include "rocksdb/options.h" 22 #include "rocksdb/snapshot.h" 23 #include "rocksdb/sst_file_writer.h" 24 #include "rocksdb/thread_status.h" 25 #include "rocksdb/transaction_log.h" 26 #include "rocksdb/types.h" 27 #include "rocksdb/version.h" 28 29 #ifdef _WIN32 30 // Windows API macro interference 31 #undef DeleteFile 32 #endif 33 34 #if defined(__GNUC__) || defined(__clang__) 35 #define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__)) 36 #elif _WIN32 37 #define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated) 38 #endif 39 40 namespace ROCKSDB_NAMESPACE { 41 42 struct Options; 43 struct DBOptions; 44 struct ColumnFamilyOptions; 45 struct ReadOptions; 46 struct WriteOptions; 47 struct FlushOptions; 48 struct CompactionOptions; 49 struct CompactRangeOptions; 50 struct TableProperties; 51 struct ExternalSstFileInfo; 52 class WriteBatch; 53 class Env; 54 class EventListener; 55 class StatsHistoryIterator; 56 class TraceWriter; 57 #ifdef ROCKSDB_LITE 58 class CompactionJobInfo; 59 #endif 60 class FileSystem; 61 62 extern const std::string kDefaultColumnFamilyName; 63 extern const std::string kPersistentStatsColumnFamilyName; 64 struct ColumnFamilyDescriptor { 65 std::string name; 66 ColumnFamilyOptions options; 
ColumnFamilyDescriptorColumnFamilyDescriptor67 ColumnFamilyDescriptor() 68 : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {} ColumnFamilyDescriptorColumnFamilyDescriptor69 ColumnFamilyDescriptor(const std::string& _name, 70 const ColumnFamilyOptions& _options) 71 : name(_name), options(_options) {} 72 }; 73 74 class ColumnFamilyHandle { 75 public: ~ColumnFamilyHandle()76 virtual ~ColumnFamilyHandle() {} 77 // Returns the name of the column family associated with the current handle. 78 virtual const std::string& GetName() const = 0; 79 // Returns the ID of the column family associated with the current handle. 80 virtual uint32_t GetID() const = 0; 81 // Fills "*desc" with the up-to-date descriptor of the column family 82 // associated with this handle. Since it fills "*desc" with the up-to-date 83 // information, this call might internally lock and release DB mutex to 84 // access the up-to-date CF options. In addition, all the pointer-typed 85 // options cannot be referenced any longer than the original options exist. 86 // 87 // Note that this function is not supported in RocksDBLite. 88 virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) = 0; 89 // Returns the comparator of the column family associated with the 90 // current handle. 
91 virtual const Comparator* GetComparator() const = 0; 92 }; 93 94 static const int kMajorVersion = __ROCKSDB_MAJOR__; 95 static const int kMinorVersion = __ROCKSDB_MINOR__; 96 97 // A range of keys 98 struct Range { 99 Slice start; 100 Slice limit; 101 RangeRange102 Range() {} RangeRange103 Range(const Slice& s, const Slice& l) : start(s), limit(l) {} 104 }; 105 106 struct RangePtr { 107 const Slice* start; 108 const Slice* limit; 109 RangePtrRangePtr110 RangePtr() : start(nullptr), limit(nullptr) {} RangePtrRangePtr111 RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {} 112 }; 113 114 struct IngestExternalFileArg { 115 ColumnFamilyHandle* column_family = nullptr; 116 std::vector<std::string> external_files; 117 IngestExternalFileOptions options; 118 }; 119 120 struct GetMergeOperandsOptions { 121 int expected_max_number_of_operands = 0; 122 }; 123 124 // A collections of table properties objects, where 125 // key: is the table's file name. 126 // value: the table properties object of the given table. 127 typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>> 128 TablePropertiesCollection; 129 130 // A DB is a persistent ordered map from keys to values. 131 // A DB is safe for concurrent access from multiple threads without 132 // any external synchronization. 133 class DB { 134 public: 135 // Open the database with the specified "name". 136 // Stores a pointer to a heap-allocated database in *dbptr and returns 137 // OK on success. 138 // Stores nullptr in *dbptr and returns a non-OK status on error. 139 // Caller should delete *dbptr when it is no longer needed. 140 static Status Open(const Options& options, const std::string& name, 141 DB** dbptr); 142 143 // Open the database for read only. All DB interfaces 144 // that modify data, like put/delete, will return error. 145 // If the db is opened in read only mode, then no compactions 146 // will happen. 
147 // 148 // Not supported in ROCKSDB_LITE, in which case the function will 149 // return Status::NotSupported. 150 static Status OpenForReadOnly(const Options& options, const std::string& name, 151 DB** dbptr, 152 bool error_if_log_file_exist = false); 153 154 // Open the database for read only with column families. When opening DB with 155 // read only, you can specify only a subset of column families in the 156 // database that should be opened. However, you always need to specify default 157 // column family. The default column family name is 'default' and it's stored 158 // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName 159 // 160 // Not supported in ROCKSDB_LITE, in which case the function will 161 // return Status::NotSupported. 162 static Status OpenForReadOnly( 163 const DBOptions& db_options, const std::string& name, 164 const std::vector<ColumnFamilyDescriptor>& column_families, 165 std::vector<ColumnFamilyHandle*>* handles, DB** dbptr, 166 bool error_if_log_file_exist = false); 167 168 // The following OpenAsSecondary functions create a secondary instance that 169 // can dynamically tail the MANIFEST of a primary that must have already been 170 // created. User can call TryCatchUpWithPrimary to make the secondary 171 // instance catch up with primary (WAL tailing is NOT supported now) whenever 172 // the user feels necessary. Column families created by the primary after the 173 // secondary instance starts are currently ignored by the secondary instance. 174 // Column families opened by secondary and dropped by the primary will be 175 // dropped by secondary as well. However the user of the secondary instance 176 // can still access the data of such dropped column family as long as they 177 // do not destroy the corresponding column family handle. 178 // WAL tailing is not supported at present, but will arrive soon. 179 // 180 // The options argument specifies the options to open the secondary instance. 
181 // The name argument specifies the name of the primary db that you have used 182 // to open the primary instance. 183 // The secondary_path argument points to a directory where the secondary 184 // instance stores its info log. 185 // The dbptr is an out-arg corresponding to the opened secondary instance. 186 // The pointer points to a heap-allocated database, and the user should 187 // delete it after use. 188 // Open DB as secondary instance with only the default column family. 189 // Return OK on success, non-OK on failures. 190 static Status OpenAsSecondary(const Options& options, const std::string& name, 191 const std::string& secondary_path, DB** dbptr); 192 193 // Open DB as secondary instance with column families. You can open a subset 194 // of column families in secondary mode. 195 // The db_options specify the database specific options. 196 // The name argument specifies the name of the primary db that you have used 197 // to open the primary instance. 198 // The secondary_path argument points to a directory where the secondary 199 // instance stores its info log. 200 // The column_families argument specifieds a list of column families to open. 201 // If any of the column families does not exist, the function returns non-OK 202 // status. 203 // The handles is an out-arg corresponding to the opened database column 204 // familiy handles. 205 // The dbptr is an out-arg corresponding to the opened secondary instance. 206 // The pointer points to a heap-allocated database, and the caller should 207 // delete it after use. Before deleting the dbptr, the user should also 208 // delete the pointers stored in handles vector. 209 // Return OK on success, on-OK on failures. 
210 static Status OpenAsSecondary( 211 const DBOptions& db_options, const std::string& name, 212 const std::string& secondary_path, 213 const std::vector<ColumnFamilyDescriptor>& column_families, 214 std::vector<ColumnFamilyHandle*>* handles, DB** dbptr); 215 216 // Open DB with column families. 217 // db_options specify database specific options 218 // column_families is the vector of all column families in the database, 219 // containing column family name and options. You need to open ALL column 220 // families in the database. To get the list of column families, you can use 221 // ListColumnFamilies(). Also, you can open only a subset of column families 222 // for read-only access. 223 // The default column family name is 'default' and it's stored 224 // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName. 225 // If everything is OK, handles will on return be the same size 226 // as column_families --- handles[i] will be a handle that you 227 // will use to operate on column family column_family[i]. 228 // Before delete DB, you have to close All column families by calling 229 // DestroyColumnFamilyHandle() with all the handles. 230 static Status Open(const DBOptions& db_options, const std::string& name, 231 const std::vector<ColumnFamilyDescriptor>& column_families, 232 std::vector<ColumnFamilyHandle*>* handles, DB** dbptr); 233 Resume()234 virtual Status Resume() { return Status::NotSupported(); } 235 236 // Close the DB by releasing resources, closing files etc. This should be 237 // called before calling the destructor so that the caller can get back a 238 // status in case there are any errors. This will not fsync the WAL files. 239 // If syncing is required, the caller must first call SyncWAL(), or Write() 240 // using an empty write batch with WriteOptions.sync=true. 241 // Regardless of the return status, the DB must be freed. 242 // If the return status is Aborted(), closing fails because there is 243 // unreleased snapshot in the system. 
In this case, users can release 244 // the unreleased snapshots and try again and expect it to succeed. For 245 // other status, recalling Close() will be no-op. 246 // If the return status is NotSupported(), then the DB implementation does 247 // cleanup in the destructor Close()248 virtual Status Close() { return Status::NotSupported(); } 249 250 // ListColumnFamilies will open the DB specified by argument name 251 // and return the list of all column families in that DB 252 // through column_families argument. The ordering of 253 // column families in column_families is unspecified. 254 static Status ListColumnFamilies(const DBOptions& db_options, 255 const std::string& name, 256 std::vector<std::string>* column_families); 257 DB()258 DB() {} 259 // No copying allowed 260 DB(const DB&) = delete; 261 void operator=(const DB&) = delete; 262 263 virtual ~DB(); 264 265 // Create a column_family and return the handle of column family 266 // through the argument handle. 267 virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, 268 const std::string& column_family_name, 269 ColumnFamilyHandle** handle); 270 271 // Bulk create column families with the same column family options. 272 // Return the handles of the column families through the argument handles. 273 // In case of error, the request may succeed partially, and handles will 274 // contain column family handles that it managed to create, and have size 275 // equal to the number of created column families. 276 virtual Status CreateColumnFamilies( 277 const ColumnFamilyOptions& options, 278 const std::vector<std::string>& column_family_names, 279 std::vector<ColumnFamilyHandle*>* handles); 280 281 // Bulk create column families. 282 // Return the handles of the column families through the argument handles. 
283 // In case of error, the request may succeed partially, and handles will 284 // contain column family handles that it managed to create, and have size 285 // equal to the number of created column families. 286 virtual Status CreateColumnFamilies( 287 const std::vector<ColumnFamilyDescriptor>& column_families, 288 std::vector<ColumnFamilyHandle*>* handles); 289 290 // Drop a column family specified by column_family handle. This call 291 // only records a drop record in the manifest and prevents the column 292 // family from flushing and compacting. 293 virtual Status DropColumnFamily(ColumnFamilyHandle* column_family); 294 295 // Bulk drop column families. This call only records drop records in the 296 // manifest and prevents the column families from flushing and compacting. 297 // In case of error, the request may succeed partially. User may call 298 // ListColumnFamilies to check the result. 299 virtual Status DropColumnFamilies( 300 const std::vector<ColumnFamilyHandle*>& column_families); 301 302 // Close a column family specified by column_family handle and destroy 303 // the column family handle specified to avoid double deletion. This call 304 // deletes the column family handle by default. Use this method to 305 // close column family instead of deleting column family handle directly 306 virtual Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family); 307 308 // Set the database entry for "key" to "value". 309 // If "key" already exists, it will be overwritten. 310 // Returns OK on success, and a non-OK status on error. 311 // Note: consider setting options.sync = true. 
312 virtual Status Put(const WriteOptions& options, 313 ColumnFamilyHandle* column_family, const Slice& key, 314 const Slice& value) = 0; Put(const WriteOptions & options,const Slice & key,const Slice & value)315 virtual Status Put(const WriteOptions& options, const Slice& key, 316 const Slice& value) { 317 return Put(options, DefaultColumnFamily(), key, value); 318 } 319 320 // Remove the database entry (if any) for "key". Returns OK on 321 // success, and a non-OK status on error. It is not an error if "key" 322 // did not exist in the database. 323 // Note: consider setting options.sync = true. 324 virtual Status Delete(const WriteOptions& options, 325 ColumnFamilyHandle* column_family, 326 const Slice& key) = 0; Delete(const WriteOptions & options,const Slice & key)327 virtual Status Delete(const WriteOptions& options, const Slice& key) { 328 return Delete(options, DefaultColumnFamily(), key); 329 } 330 331 // Remove the database entry for "key". Requires that the key exists 332 // and was not overwritten. Returns OK on success, and a non-OK status 333 // on error. It is not an error if "key" did not exist in the database. 334 // 335 // If a key is overwritten (by calling Put() multiple times), then the result 336 // of calling SingleDelete() on this key is undefined. SingleDelete() only 337 // behaves correctly if there has been only one Put() for this key since the 338 // previous call to SingleDelete() for this key. 339 // 340 // This feature is currently an experimental performance optimization 341 // for a very specific workload. It is up to the caller to ensure that 342 // SingleDelete is only used for a key that is not deleted using Delete() or 343 // written using Merge(). Mixing SingleDelete operations with Deletes and 344 // Merges can result in undefined behavior. 345 // 346 // Note: consider setting options.sync = true. 
347 virtual Status SingleDelete(const WriteOptions& options, 348 ColumnFamilyHandle* column_family, 349 const Slice& key) = 0; SingleDelete(const WriteOptions & options,const Slice & key)350 virtual Status SingleDelete(const WriteOptions& options, const Slice& key) { 351 return SingleDelete(options, DefaultColumnFamily(), key); 352 } 353 354 // Removes the database entries in the range ["begin_key", "end_key"), i.e., 355 // including "begin_key" and excluding "end_key". Returns OK on success, and 356 // a non-OK status on error. It is not an error if no keys exist in the range 357 // ["begin_key", "end_key"). 358 // 359 // This feature is now usable in production, with the following caveats: 360 // 1) Accumulating many range tombstones in the memtable will degrade read 361 // performance; this can be avoided by manually flushing occasionally. 362 // 2) Limiting the maximum number of open files in the presence of range 363 // tombstones can degrade read performance. To avoid this problem, set 364 // max_open_files to -1 whenever possible. 365 virtual Status DeleteRange(const WriteOptions& options, 366 ColumnFamilyHandle* column_family, 367 const Slice& begin_key, const Slice& end_key); 368 369 // Merge the database entry for "key" with "value". Returns OK on success, 370 // and a non-OK status on error. The semantics of this operation is 371 // determined by the user provided merge_operator when opening DB. 372 // Note: consider setting options.sync = true. 373 virtual Status Merge(const WriteOptions& options, 374 ColumnFamilyHandle* column_family, const Slice& key, 375 const Slice& value) = 0; Merge(const WriteOptions & options,const Slice & key,const Slice & value)376 virtual Status Merge(const WriteOptions& options, const Slice& key, 377 const Slice& value) { 378 return Merge(options, DefaultColumnFamily(), key, value); 379 } 380 381 // Apply the specified updates to the database. 
382 // If `updates` contains no update, WAL will still be synced if 383 // options.sync=true. 384 // Returns OK on success, non-OK on failure. 385 // Note: consider setting options.sync = true. 386 virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; 387 388 // If the database contains an entry for "key" store the 389 // corresponding value in *value and return OK. 390 // 391 // If there is no entry for "key" leave *value unchanged and return 392 // a status for which Status::IsNotFound() returns true. 393 // 394 // May return some other Status on an error. Get(const ReadOptions & options,ColumnFamilyHandle * column_family,const Slice & key,std::string * value)395 virtual inline Status Get(const ReadOptions& options, 396 ColumnFamilyHandle* column_family, const Slice& key, 397 std::string* value) { 398 assert(value != nullptr); 399 PinnableSlice pinnable_val(value); 400 assert(!pinnable_val.IsPinned()); 401 auto s = Get(options, column_family, key, &pinnable_val); 402 if (s.ok() && pinnable_val.IsPinned()) { 403 value->assign(pinnable_val.data(), pinnable_val.size()); 404 } // else value is already assigned 405 return s; 406 } 407 virtual Status Get(const ReadOptions& options, 408 ColumnFamilyHandle* column_family, const Slice& key, 409 PinnableSlice* value) = 0; Get(const ReadOptions & options,const Slice & key,std::string * value)410 virtual Status Get(const ReadOptions& options, const Slice& key, 411 std::string* value) { 412 return Get(options, DefaultColumnFamily(), key, value); 413 } 414 415 // Returns all the merge operands corresponding to the key. If the 416 // number of merge operands in DB is greater than 417 // merge_operands_options.expected_max_number_of_operands 418 // no merge operands are returned and status is Incomplete. Merge operands 419 // returned are in the order of insertion. 
420 // merge_operands- Points to an array of at-least 421 // merge_operands_options.expected_max_number_of_operands and the 422 // caller is responsible for allocating it. If the status 423 // returned is Incomplete then number_of_operands will contain 424 // the total number of merge operands found in DB for key. 425 virtual Status GetMergeOperands( 426 const ReadOptions& options, ColumnFamilyHandle* column_family, 427 const Slice& key, PinnableSlice* merge_operands, 428 GetMergeOperandsOptions* get_merge_operands_options, 429 int* number_of_operands) = 0; 430 431 // If keys[i] does not exist in the database, then the i'th returned 432 // status will be one for which Status::IsNotFound() is true, and 433 // (*values)[i] will be set to some arbitrary value (often ""). Otherwise, 434 // the i'th returned status will have Status::ok() true, and (*values)[i] 435 // will store the value associated with keys[i]. 436 // 437 // (*values) will always be resized to be the same size as (keys). 438 // Similarly, the number of returned statuses will be the number of keys. 439 // Note: keys will not be "de-duplicated". Duplicate keys will return 440 // duplicate values in order. 441 virtual std::vector<Status> MultiGet( 442 const ReadOptions& options, 443 const std::vector<ColumnFamilyHandle*>& column_family, 444 const std::vector<Slice>& keys, std::vector<std::string>* values) = 0; MultiGet(const ReadOptions & options,const std::vector<Slice> & keys,std::vector<std::string> * values)445 virtual std::vector<Status> MultiGet(const ReadOptions& options, 446 const std::vector<Slice>& keys, 447 std::vector<std::string>* values) { 448 return MultiGet( 449 options, 450 std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()), 451 keys, values); 452 } 453 454 // Overloaded MultiGet API that improves performance by batching operations 455 // in the read path for greater efficiency. Currently, only the block based 456 // table format with full filters are supported. 
Other table formats such 457 // as plain table, block based table with block based filters and 458 // partitioned indexes will still work, but will not get any performance 459 // benefits. 460 // Parameters - 461 // options - ReadOptions 462 // column_family - ColumnFamilyHandle* that the keys belong to. All the keys 463 // passed to the API are restricted to a single column family 464 // num_keys - Number of keys to lookup 465 // keys - Pointer to C style array of key Slices with num_keys elements 466 // values - Pointer to C style array of PinnableSlices with num_keys elements 467 // statuses - Pointer to C style array of Status with num_keys elements 468 // sorted_input - If true, it means the input keys are already sorted by key 469 // order, so the MultiGet() API doesn't have to sort them 470 // again. If false, the keys will be copied and sorted 471 // internally by the API - the input array will not be 472 // modified 473 virtual void MultiGet(const ReadOptions& options, 474 ColumnFamilyHandle* column_family, 475 const size_t num_keys, const Slice* keys, 476 PinnableSlice* values, Status* statuses, 477 const bool /*sorted_input*/ = false) { 478 std::vector<ColumnFamilyHandle*> cf; 479 std::vector<Slice> user_keys; 480 std::vector<Status> status; 481 std::vector<std::string> vals; 482 483 for (size_t i = 0; i < num_keys; ++i) { 484 cf.emplace_back(column_family); 485 user_keys.emplace_back(keys[i]); 486 } 487 status = MultiGet(options, cf, user_keys, &vals); 488 std::copy(status.begin(), status.end(), statuses); 489 for (auto& value : vals) { 490 values->PinSelf(value); 491 values++; 492 } 493 } 494 495 // Overloaded MultiGet API that improves performance by batching operations 496 // in the read path for greater efficiency. Currently, only the block based 497 // table format with full filters are supported. 
Other table formats such 498 // as plain table, block based table with block based filters and 499 // partitioned indexes will still work, but will not get any performance 500 // benefits. 501 // Parameters - 502 // options - ReadOptions 503 // column_family - ColumnFamilyHandle* that the keys belong to. All the keys 504 // passed to the API are restricted to a single column family 505 // num_keys - Number of keys to lookup 506 // keys - Pointer to C style array of key Slices with num_keys elements 507 // values - Pointer to C style array of PinnableSlices with num_keys elements 508 // statuses - Pointer to C style array of Status with num_keys elements 509 // sorted_input - If true, it means the input keys are already sorted by key 510 // order, so the MultiGet() API doesn't have to sort them 511 // again. If false, the keys will be copied and sorted 512 // internally by the API - the input array will not be 513 // modified 514 virtual void MultiGet(const ReadOptions& options, const size_t num_keys, 515 ColumnFamilyHandle** column_families, const Slice* keys, 516 PinnableSlice* values, Status* statuses, 517 const bool /*sorted_input*/ = false) { 518 std::vector<ColumnFamilyHandle*> cf; 519 std::vector<Slice> user_keys; 520 std::vector<Status> status; 521 std::vector<std::string> vals; 522 523 for (size_t i = 0; i < num_keys; ++i) { 524 cf.emplace_back(column_families[i]); 525 user_keys.emplace_back(keys[i]); 526 } 527 status = MultiGet(options, cf, user_keys, &vals); 528 std::copy(status.begin(), status.end(), statuses); 529 for (auto& value : vals) { 530 values->PinSelf(value); 531 values++; 532 } 533 } 534 535 // If the key definitely does not exist in the database, then this method 536 // returns false, else true. If the caller wants to obtain value when the key 537 // is found in memory, a bool for 'value_found' must be passed. 'value_found' 538 // will be true on return if value has been set properly. 
539 // This check is potentially lighter-weight than invoking DB::Get(). One way 540 // to make this lighter weight is to avoid doing any IOs. 541 // Default implementation here returns true and sets 'value_found' to false 542 virtual bool KeyMayExist(const ReadOptions& /*options*/, 543 ColumnFamilyHandle* /*column_family*/, 544 const Slice& /*key*/, std::string* /*value*/, 545 bool* value_found = nullptr) { 546 if (value_found != nullptr) { 547 *value_found = false; 548 } 549 return true; 550 } 551 virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, 552 std::string* value, bool* value_found = nullptr) { 553 return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found); 554 } 555 556 // Return a heap-allocated iterator over the contents of the database. 557 // The result of NewIterator() is initially invalid (caller must 558 // call one of the Seek methods on the iterator before using it). 559 // 560 // Caller should delete the iterator when it is no longer needed. 561 // The returned iterator should be deleted before this db is deleted. 562 virtual Iterator* NewIterator(const ReadOptions& options, 563 ColumnFamilyHandle* column_family) = 0; NewIterator(const ReadOptions & options)564 virtual Iterator* NewIterator(const ReadOptions& options) { 565 return NewIterator(options, DefaultColumnFamily()); 566 } 567 // Returns iterators from a consistent database state across multiple 568 // column families. Iterators are heap allocated and need to be deleted 569 // before the db is deleted 570 virtual Status NewIterators( 571 const ReadOptions& options, 572 const std::vector<ColumnFamilyHandle*>& column_families, 573 std::vector<Iterator*>* iterators) = 0; 574 575 // Return a handle to the current DB state. Iterators created with 576 // this handle will all observe a stable snapshot of the current DB 577 // state. The caller must call ReleaseSnapshot(result) when the 578 // snapshot is no longer needed. 
579 // 580 // nullptr will be returned if the DB fails to take a snapshot or does 581 // not support snapshot. 582 virtual const Snapshot* GetSnapshot() = 0; 583 584 // Release a previously acquired snapshot. The caller must not 585 // use "snapshot" after this call. 586 virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; 587 588 #ifndef ROCKSDB_LITE 589 // Contains all valid property arguments for GetProperty(). 590 // 591 // NOTE: Property names cannot end in numbers since those are interpreted as 592 // arguments, e.g., see kNumFilesAtLevelPrefix. 593 struct Properties { 594 // "rocksdb.num-files-at-level<N>" - returns string containing the number 595 // of files at level <N>, where <N> is an ASCII representation of a 596 // level number (e.g., "0"). 597 static const std::string kNumFilesAtLevelPrefix; 598 599 // "rocksdb.compression-ratio-at-level<N>" - returns string containing the 600 // compression ratio of data at level <N>, where <N> is an ASCII 601 // representation of a level number (e.g., "0"). Here, compression 602 // ratio is defined as uncompressed data size / compressed file size. 603 // Returns "-1.0" if no open files at level <N>. 604 static const std::string kCompressionRatioAtLevelPrefix; 605 606 // "rocksdb.stats" - returns a multi-line string containing the data 607 // described by kCFStats followed by the data described by kDBStats. 608 static const std::string kStats; 609 610 // "rocksdb.sstables" - returns a multi-line string summarizing current 611 // SST files. 612 static const std::string kSSTables; 613 614 // "rocksdb.cfstats" - Both of "rocksdb.cfstats-no-file-histogram" and 615 // "rocksdb.cf-file-histogram" together. See below for description 616 // of the two. 
static const std::string kCFStats;

//  "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with
//      general column family stats per-level over db's lifetime ("L<n>"),
//      aggregated over db's lifetime ("Sum"), and aggregated over the
//      interval since the last retrieval ("Int").
//  It could also be used to return the stats in the format of the map.
//  In this case there will be a pair of string to array of double for
//  each level as well as for "Sum". "Int" stats will not be affected
//  when this form of stats are retrieved.
static const std::string kCFStatsNoFileHistogram;

//  "rocksdb.cf-file-histogram" - print out how many file reads to every
//      level, as well as the histogram of latency of single requests.
static const std::string kCFFileHistogram;

//  "rocksdb.dbstats" - returns a multi-line string with general database
//      stats, both cumulative (over the db's lifetime) and interval (since
//      the last retrieval of kDBStats).
static const std::string kDBStats;

//  "rocksdb.levelstats" - returns multi-line string containing the number
//      of files per level and total size of each level (MB).
static const std::string kLevelStats;

//  "rocksdb.num-immutable-mem-table" - returns number of immutable
//      memtables that have not yet been flushed.
static const std::string kNumImmutableMemTable;

//  "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable
//      memtables that have already been flushed.
static const std::string kNumImmutableMemTableFlushed;

//  "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is
//      pending; otherwise, returns 0.
static const std::string kMemTableFlushPending;

//  "rocksdb.num-running-flushes" - returns the number of currently running
//      flushes.
static const std::string kNumRunningFlushes;

//  "rocksdb.compaction-pending" - returns 1 when at least one compaction
//      is pending, and 0 otherwise.
static const std::string kCompactionPending;

//  "rocksdb.num-running-compactions" - returns how many compactions are
//      currently running.
static const std::string kNumRunningCompactions;

//  "rocksdb.background-errors" - returns the accumulated number of
//      background errors.
static const std::string kBackgroundErrors;

//  "rocksdb.cur-size-active-mem-table" - returns the approximate size of
//      the active memtable (bytes).
static const std::string kCurSizeActiveMemTable;

//  "rocksdb.cur-size-all-mem-tables" - returns the approximate size of the
//      active and unflushed immutable memtables (bytes).
static const std::string kCurSizeAllMemTables;

//  "rocksdb.size-all-mem-tables" - returns the approximate size of the
//      active, unflushed immutable, and pinned immutable memtables (bytes).
static const std::string kSizeAllMemTables;

//  "rocksdb.num-entries-active-mem-table" - returns the total number of
//      entries in the active memtable.
static const std::string kNumEntriesActiveMemTable;

//  "rocksdb.num-entries-imm-mem-tables" - returns the total number of
//      entries in the unflushed immutable memtables.
static const std::string kNumEntriesImmMemTables;

//  "rocksdb.num-deletes-active-mem-table" - returns the total number of
//      delete entries in the active memtable.
static const std::string kNumDeletesActiveMemTable;

//  "rocksdb.num-deletes-imm-mem-tables" - returns the total number of
//      delete entries in the unflushed immutable memtables.
static const std::string kNumDeletesImmMemTables;

//  "rocksdb.estimate-num-keys" - returns estimated number of total keys in
//      the active and unflushed immutable memtables and storage.
700 static const std::string kEstimateNumKeys; 701 702 // "rocksdb.estimate-table-readers-mem" - returns estimated memory used for 703 // reading SST tables, excluding memory used in block cache (e.g., 704 // filter and index blocks). 705 static const std::string kEstimateTableReadersMem; 706 707 // "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete 708 // files is enabled; otherwise, returns a non-zero number. 709 static const std::string kIsFileDeletionsEnabled; 710 711 // "rocksdb.num-snapshots" - returns number of unreleased snapshots of the 712 // database. 713 static const std::string kNumSnapshots; 714 715 // "rocksdb.oldest-snapshot-time" - returns number representing unix 716 // timestamp of oldest unreleased snapshot. 717 static const std::string kOldestSnapshotTime; 718 719 // "rocksdb.oldest-snapshot-sequence" - returns number representing 720 // sequence number of oldest unreleased snapshot. 721 static const std::string kOldestSnapshotSequence; 722 723 // "rocksdb.num-live-versions" - returns number of live versions. `Version` 724 // is an internal data structure. See version_set.h for details. More 725 // live versions often mean more SST files are held from being deleted, 726 // by iterators or unfinished compactions. 727 static const std::string kNumLiveVersions; 728 729 // "rocksdb.current-super-version-number" - returns number of current LSM 730 // version. It is a uint64_t integer number, incremented after there is 731 // any change to the LSM tree. The number is not preserved after restarting 732 // the DB. After DB restart, it will start from 0 again. 733 static const std::string kCurrentSuperVersionNumber; 734 735 // "rocksdb.estimate-live-data-size" - returns an estimate of the amount of 736 // live data in bytes. 737 static const std::string kEstimateLiveDataSize; 738 739 // "rocksdb.min-log-number-to-keep" - return the minimum log number of the 740 // log files that should be kept. 
741 static const std::string kMinLogNumberToKeep; 742 743 // "rocksdb.min-obsolete-sst-number-to-keep" - return the minimum file 744 // number for an obsolete SST to be kept. The max value of `uint64_t` 745 // will be returned if all obsolete files can be deleted. 746 static const std::string kMinObsoleteSstNumberToKeep; 747 748 // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST 749 // files. 750 // WARNING: may slow down online queries if there are too many files. 751 static const std::string kTotalSstFilesSize; 752 753 // "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST 754 // files belong to the latest LSM tree. 755 static const std::string kLiveSstFilesSize; 756 757 // "rocksdb.base-level" - returns number of level to which L0 data will be 758 // compacted. 759 static const std::string kBaseLevel; 760 761 // "rocksdb.estimate-pending-compaction-bytes" - returns estimated total 762 // number of bytes compaction needs to rewrite to get all levels down 763 // to under target size. Not valid for other compactions than level- 764 // based. 765 static const std::string kEstimatePendingCompactionBytes; 766 767 // "rocksdb.aggregated-table-properties" - returns a string representation 768 // of the aggregated table properties of the target column family. 769 static const std::string kAggregatedTableProperties; 770 771 // "rocksdb.aggregated-table-properties-at-level<N>", same as the previous 772 // one but only returns the aggregated table properties of the 773 // specified level "N" at the target column family. 774 static const std::string kAggregatedTablePropertiesAtLevel; 775 776 // "rocksdb.actual-delayed-write-rate" - returns the current actual delayed 777 // write rate. 0 means no delay. 778 static const std::string kActualDelayedWriteRate; 779 780 // "rocksdb.is-write-stopped" - Return 1 if write has been stopped. 
781 static const std::string kIsWriteStopped; 782 783 // "rocksdb.estimate-oldest-key-time" - returns an estimation of 784 // oldest key timestamp in the DB. Currently only available for 785 // FIFO compaction with 786 // compaction_options_fifo.allow_compaction = false. 787 static const std::string kEstimateOldestKeyTime; 788 789 // "rocksdb.block-cache-capacity" - returns block cache capacity. 790 static const std::string kBlockCacheCapacity; 791 792 // "rocksdb.block-cache-usage" - returns the memory size for the entries 793 // residing in block cache. 794 static const std::string kBlockCacheUsage; 795 796 // "rocksdb.block-cache-pinned-usage" - returns the memory size for the 797 // entries being pinned. 798 static const std::string kBlockCachePinnedUsage; 799 800 // "rocksdb.options-statistics" - returns multi-line string 801 // of options.statistics 802 static const std::string kOptionsStatistics; 803 }; 804 #endif /* ROCKSDB_LITE */ 805 806 // DB implementations can export properties about their state via this method. 807 // If "property" is a valid property understood by this DB implementation (see 808 // Properties struct above for valid options), fills "*value" with its current 809 // value and returns true. Otherwise, returns false. 
810 virtual bool GetProperty(ColumnFamilyHandle* column_family, 811 const Slice& property, std::string* value) = 0; GetProperty(const Slice & property,std::string * value)812 virtual bool GetProperty(const Slice& property, std::string* value) { 813 return GetProperty(DefaultColumnFamily(), property, value); 814 } 815 virtual bool GetMapProperty(ColumnFamilyHandle* column_family, 816 const Slice& property, 817 std::map<std::string, std::string>* value) = 0; GetMapProperty(const Slice & property,std::map<std::string,std::string> * value)818 virtual bool GetMapProperty(const Slice& property, 819 std::map<std::string, std::string>* value) { 820 return GetMapProperty(DefaultColumnFamily(), property, value); 821 } 822 823 // Similar to GetProperty(), but only works for a subset of properties whose 824 // return value is an integer. Return the value by integer. Supported 825 // properties: 826 // "rocksdb.num-immutable-mem-table" 827 // "rocksdb.mem-table-flush-pending" 828 // "rocksdb.compaction-pending" 829 // "rocksdb.background-errors" 830 // "rocksdb.cur-size-active-mem-table" 831 // "rocksdb.cur-size-all-mem-tables" 832 // "rocksdb.size-all-mem-tables" 833 // "rocksdb.num-entries-active-mem-table" 834 // "rocksdb.num-entries-imm-mem-tables" 835 // "rocksdb.num-deletes-active-mem-table" 836 // "rocksdb.num-deletes-imm-mem-tables" 837 // "rocksdb.estimate-num-keys" 838 // "rocksdb.estimate-table-readers-mem" 839 // "rocksdb.is-file-deletions-enabled" 840 // "rocksdb.num-snapshots" 841 // "rocksdb.oldest-snapshot-time" 842 // "rocksdb.num-live-versions" 843 // "rocksdb.current-super-version-number" 844 // "rocksdb.estimate-live-data-size" 845 // "rocksdb.min-log-number-to-keep" 846 // "rocksdb.min-obsolete-sst-number-to-keep" 847 // "rocksdb.total-sst-files-size" 848 // "rocksdb.live-sst-files-size" 849 // "rocksdb.base-level" 850 // "rocksdb.estimate-pending-compaction-bytes" 851 // "rocksdb.num-running-compactions" 852 // "rocksdb.num-running-flushes" 853 // 
"rocksdb.actual-delayed-write-rate" 854 // "rocksdb.is-write-stopped" 855 // "rocksdb.estimate-oldest-key-time" 856 // "rocksdb.block-cache-capacity" 857 // "rocksdb.block-cache-usage" 858 // "rocksdb.block-cache-pinned-usage" 859 virtual bool GetIntProperty(ColumnFamilyHandle* column_family, 860 const Slice& property, uint64_t* value) = 0; GetIntProperty(const Slice & property,uint64_t * value)861 virtual bool GetIntProperty(const Slice& property, uint64_t* value) { 862 return GetIntProperty(DefaultColumnFamily(), property, value); 863 } 864 865 // Reset internal stats for DB and all column families. 866 // Note this doesn't reset options.statistics as it is not owned by 867 // DB. ResetStats()868 virtual Status ResetStats() { 869 return Status::NotSupported("Not implemented"); 870 } 871 872 // Same as GetIntProperty(), but this one returns the aggregated int 873 // property from all column families. 874 virtual bool GetAggregatedIntProperty(const Slice& property, 875 uint64_t* value) = 0; 876 877 // Flags for DB::GetSizeApproximation that specify whether memtable 878 // stats should be included, or file stats approximation or both 879 enum SizeApproximationFlags : uint8_t { 880 NONE = 0, 881 INCLUDE_MEMTABLES = 1 << 0, 882 INCLUDE_FILES = 1 << 1 883 }; 884 885 // For each i in [0,n-1], store in "sizes[i]", the approximate 886 // file system space used by keys in "[range[i].start .. range[i].limit)". 887 // 888 // Note that the returned sizes measure file system space usage, so 889 // if the user data compresses by a factor of ten, the returned 890 // sizes will be one-tenth the size of the corresponding user data size. 891 virtual Status GetApproximateSizes(const SizeApproximationOptions& options, 892 ColumnFamilyHandle* column_family, 893 const Range* range, int n, 894 uint64_t* sizes) = 0; 895 896 // Simpler versions of the GetApproximateSizes() method above. 897 // The include_flags argumenbt must of type DB::SizeApproximationFlags 898 // and can not be NONE. 
899 virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, 900 const Range* range, int n, uint64_t* sizes, 901 uint8_t include_flags = INCLUDE_FILES) { 902 SizeApproximationOptions options; 903 options.include_memtabtles = 904 (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0; 905 options.include_files = 906 (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0; 907 GetApproximateSizes(options, column_family, range, n, sizes); 908 } 909 virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes, 910 uint8_t include_flags = INCLUDE_FILES) { 911 GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags); 912 } 913 914 // The method is similar to GetApproximateSizes, except it 915 // returns approximate number of records in memtables. 916 virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, 917 const Range& range, 918 uint64_t* const count, 919 uint64_t* const size) = 0; GetApproximateMemTableStats(const Range & range,uint64_t * const count,uint64_t * const size)920 virtual void GetApproximateMemTableStats(const Range& range, 921 uint64_t* const count, 922 uint64_t* const size) { 923 GetApproximateMemTableStats(DefaultColumnFamily(), range, count, size); 924 } 925 926 // Deprecated versions of GetApproximateSizes GetApproximateSizes(const Range * range,int n,uint64_t * sizes,bool include_memtable)927 ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes( 928 const Range* range, int n, uint64_t* sizes, bool include_memtable) { 929 uint8_t include_flags = SizeApproximationFlags::INCLUDE_FILES; 930 if (include_memtable) { 931 include_flags |= SizeApproximationFlags::INCLUDE_MEMTABLES; 932 } 933 GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags); 934 } GetApproximateSizes(ColumnFamilyHandle * column_family,const Range * range,int n,uint64_t * sizes,bool include_memtable)935 ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes( 936 ColumnFamilyHandle* 
column_family, const Range* range, int n, 937 uint64_t* sizes, bool include_memtable) { 938 uint8_t include_flags = SizeApproximationFlags::INCLUDE_FILES; 939 if (include_memtable) { 940 include_flags |= SizeApproximationFlags::INCLUDE_MEMTABLES; 941 } 942 GetApproximateSizes(column_family, range, n, sizes, include_flags); 943 } 944 945 // Compact the underlying storage for the key range [*begin,*end]. 946 // The actual compaction interval might be superset of [*begin, *end]. 947 // In particular, deleted and overwritten versions are discarded, 948 // and the data is rearranged to reduce the cost of operations 949 // needed to access the data. This operation should typically only 950 // be invoked by users who understand the underlying implementation. 951 // 952 // begin==nullptr is treated as a key before all keys in the database. 953 // end==nullptr is treated as a key after all keys in the database. 954 // Therefore the following call will compact the entire database: 955 // db->CompactRange(options, nullptr, nullptr); 956 // Note that after the entire database is compacted, all data are pushed 957 // down to the last level containing any data. If the total data size after 958 // compaction is reduced, that level might not be appropriate for hosting all 959 // the files. In this case, client could set options.change_level to true, to 960 // move the files back to the minimum level capable of holding the data set 961 // or a given level (specified by non-negative options.target_level). 
962 virtual Status CompactRange(const CompactRangeOptions& options, 963 ColumnFamilyHandle* column_family, 964 const Slice* begin, const Slice* end) = 0; CompactRange(const CompactRangeOptions & options,const Slice * begin,const Slice * end)965 virtual Status CompactRange(const CompactRangeOptions& options, 966 const Slice* begin, const Slice* end) { 967 return CompactRange(options, DefaultColumnFamily(), begin, end); 968 } 969 970 ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange( 971 ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end, 972 bool change_level = false, int target_level = -1, 973 uint32_t target_path_id = 0) { 974 CompactRangeOptions options; 975 options.change_level = change_level; 976 options.target_level = target_level; 977 options.target_path_id = target_path_id; 978 return CompactRange(options, column_family, begin, end); 979 } 980 981 ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange( 982 const Slice* begin, const Slice* end, bool change_level = false, 983 int target_level = -1, uint32_t target_path_id = 0) { 984 CompactRangeOptions options; 985 options.change_level = change_level; 986 options.target_level = target_level; 987 options.target_path_id = target_path_id; 988 return CompactRange(options, DefaultColumnFamily(), begin, end); 989 } 990 SetOptions(ColumnFamilyHandle *,const std::unordered_map<std::string,std::string> &)991 virtual Status SetOptions( 992 ColumnFamilyHandle* /*column_family*/, 993 const std::unordered_map<std::string, std::string>& /*new_options*/) { 994 return Status::NotSupported("Not implemented"); 995 } SetOptions(const std::unordered_map<std::string,std::string> & new_options)996 virtual Status SetOptions( 997 const std::unordered_map<std::string, std::string>& new_options) { 998 return SetOptions(DefaultColumnFamily(), new_options); 999 } 1000 1001 virtual Status SetDBOptions( 1002 const std::unordered_map<std::string, std::string>& new_options) = 0; 1003 1004 // CompactFiles() inputs a list 
of files specified by file numbers and 1005 // compacts them to the specified level. Note that the behavior is different 1006 // from CompactRange() in that CompactFiles() performs the compaction job 1007 // using the CURRENT thread. 1008 // 1009 // @see GetDataBaseMetaData 1010 // @see GetColumnFamilyMetaData 1011 virtual Status CompactFiles( 1012 const CompactionOptions& compact_options, 1013 ColumnFamilyHandle* column_family, 1014 const std::vector<std::string>& input_file_names, const int output_level, 1015 const int output_path_id = -1, 1016 std::vector<std::string>* const output_file_names = nullptr, 1017 CompactionJobInfo* compaction_job_info = nullptr) = 0; 1018 1019 virtual Status CompactFiles( 1020 const CompactionOptions& compact_options, 1021 const std::vector<std::string>& input_file_names, const int output_level, 1022 const int output_path_id = -1, 1023 std::vector<std::string>* const output_file_names = nullptr, 1024 CompactionJobInfo* compaction_job_info = nullptr) { 1025 return CompactFiles(compact_options, DefaultColumnFamily(), 1026 input_file_names, output_level, output_path_id, 1027 output_file_names, compaction_job_info); 1028 } 1029 1030 // This function will wait until all currently running background processes 1031 // finish. After it returns, no background process will be run until 1032 // ContinueBackgroundWork is called 1033 virtual Status PauseBackgroundWork() = 0; 1034 virtual Status ContinueBackgroundWork() = 0; 1035 1036 // This function will enable automatic compactions for the given column 1037 // families if they were previously disabled. The function will first set the 1038 // disable_auto_compactions option for each column family to 'false', after 1039 // which it will schedule a flush/compaction. 1040 // 1041 // NOTE: Setting disable_auto_compactions to 'false' through SetOptions() API 1042 // does NOT schedule a flush/compaction afterwards, and only changes the 1043 // parameter itself within the column family option. 
1044 // 1045 virtual Status EnableAutoCompaction( 1046 const std::vector<ColumnFamilyHandle*>& column_family_handles) = 0; 1047 1048 virtual void DisableManualCompaction() = 0; 1049 virtual void EnableManualCompaction() = 0; 1050 1051 // Number of levels used for this DB. 1052 virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; NumberLevels()1053 virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); } 1054 1055 // Maximum level to which a new compacted memtable is pushed if it 1056 // does not create overlap. 1057 virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0; MaxMemCompactionLevel()1058 virtual int MaxMemCompactionLevel() { 1059 return MaxMemCompactionLevel(DefaultColumnFamily()); 1060 } 1061 1062 // Number of files in level-0 that would stop writes. 1063 virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0; Level0StopWriteTrigger()1064 virtual int Level0StopWriteTrigger() { 1065 return Level0StopWriteTrigger(DefaultColumnFamily()); 1066 } 1067 1068 // Get DB name -- the exact same name that was provided as an argument to 1069 // DB::Open() 1070 virtual const std::string& GetName() const = 0; 1071 1072 // Get Env object from the DB 1073 virtual Env* GetEnv() const = 0; 1074 1075 virtual FileSystem* GetFileSystem() const; 1076 1077 // Get DB Options that we use. During the process of opening the 1078 // column family, the options provided when calling DB::Open() or 1079 // DB::CreateColumnFamily() will have been "sanitized" and transformed 1080 // in an implementation-defined manner. 1081 virtual Options GetOptions(ColumnFamilyHandle* column_family) const = 0; GetOptions()1082 virtual Options GetOptions() const { 1083 return GetOptions(DefaultColumnFamily()); 1084 } 1085 1086 virtual DBOptions GetDBOptions() const = 0; 1087 1088 // Flush all mem-table data. 1089 // Flush a single column family, even when atomic flush is enabled. 
To flush 1090 // multiple column families, use Flush(options, column_families). 1091 virtual Status Flush(const FlushOptions& options, 1092 ColumnFamilyHandle* column_family) = 0; Flush(const FlushOptions & options)1093 virtual Status Flush(const FlushOptions& options) { 1094 return Flush(options, DefaultColumnFamily()); 1095 } 1096 // Flushes multiple column families. 1097 // If atomic flush is not enabled, Flush(options, column_families) is 1098 // equivalent to calling Flush(options, column_family) multiple times. 1099 // If atomic flush is enabled, Flush(options, column_families) will flush all 1100 // column families specified in 'column_families' up to the latest sequence 1101 // number at the time when flush is requested. 1102 // Note that RocksDB 5.15 and earlier may not be able to open later versions 1103 // with atomic flush enabled. 1104 virtual Status Flush( 1105 const FlushOptions& options, 1106 const std::vector<ColumnFamilyHandle*>& column_families) = 0; 1107 1108 // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL 1109 // afterwards. FlushWAL(bool)1110 virtual Status FlushWAL(bool /*sync*/) { 1111 return Status::NotSupported("FlushWAL not implemented"); 1112 } 1113 // Sync the wal. Note that Write() followed by SyncWAL() is not exactly the 1114 // same as Write() with sync=true: in the latter case the changes won't be 1115 // visible until the sync is done. 1116 // Currently only works if allow_mmap_writes = false in Options. 1117 virtual Status SyncWAL() = 0; 1118 1119 // Lock the WAL. Also flushes the WAL after locking. LockWAL()1120 virtual Status LockWAL() { 1121 return Status::NotSupported("LockWAL not implemented"); 1122 } 1123 1124 // Unlock the WAL. UnlockWAL()1125 virtual Status UnlockWAL() { 1126 return Status::NotSupported("UnlockWAL not implemented"); 1127 } 1128 1129 // The sequence number of the most recent transaction. 
1130 virtual SequenceNumber GetLatestSequenceNumber() const = 0; 1131 1132 // Instructs DB to preserve deletes with sequence numbers >= passed seqnum. 1133 // Has no effect if DBOptions.preserve_deletes is set to false. 1134 // This function assumes that user calls this function with monotonically 1135 // increasing seqnums (otherwise we can't guarantee that a particular delete 1136 // hasn't been already processed); returns true if the value was successfully 1137 // updated, false if user attempted to call if with seqnum <= current value. 1138 virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) = 0; 1139 1140 #ifndef ROCKSDB_LITE 1141 1142 // Prevent file deletions. Compactions will continue to occur, 1143 // but no obsolete files will be deleted. Calling this multiple 1144 // times have the same effect as calling it once. 1145 virtual Status DisableFileDeletions() = 0; 1146 1147 // Allow compactions to delete obsolete files. 1148 // If force == true, the call to EnableFileDeletions() will guarantee that 1149 // file deletions are enabled after the call, even if DisableFileDeletions() 1150 // was called multiple times before. 1151 // If force == false, EnableFileDeletions will only enable file deletion 1152 // after it's been called at least as many times as DisableFileDeletions(), 1153 // enabling the two methods to be called by two threads concurrently without 1154 // synchronization -- i.e., file deletions will be enabled only after both 1155 // threads call EnableFileDeletions() 1156 virtual Status EnableFileDeletions(bool force = true) = 0; 1157 1158 // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup 1159 1160 // Retrieve the list of all files in the database. The files are 1161 // relative to the dbname and are not absolute paths. Despite being relative 1162 // paths, the file names begin with "/". The valid size of the manifest file 1163 // is returned in manifest_file_size. 
The manifest file is an ever growing 1164 // file, but only the portion specified by manifest_file_size is valid for 1165 // this snapshot. Setting flush_memtable to true does Flush before recording 1166 // the live files. Setting flush_memtable to false is useful when we don't 1167 // want to wait for flush which may have to wait for compaction to complete 1168 // taking an indeterminate time. 1169 // 1170 // In case you have multiple column families, even if flush_memtable is true, 1171 // you still need to call GetSortedWalFiles after GetLiveFiles to compensate 1172 // for new data that arrived to already-flushed column families while other 1173 // column families were flushing 1174 virtual Status GetLiveFiles(std::vector<std::string>&, 1175 uint64_t* manifest_file_size, 1176 bool flush_memtable = true) = 0; 1177 1178 // Retrieve the sorted list of all wal files with earliest file first 1179 virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0; 1180 1181 // Retrieve information about the current wal file 1182 // 1183 // Note that the log might have rolled after this call in which case 1184 // the current_log_file would not point to the current log file. 1185 // 1186 // Additionally, for the sake of optimization current_log_file->StartSequence 1187 // would always be set to 0 1188 virtual Status GetCurrentWalFile( 1189 std::unique_ptr<LogFile>* current_log_file) = 0; 1190 1191 // Retrieves the creation time of the oldest file in the DB. 1192 // This API only works if max_open_files = -1, if it is not then 1193 // Status returned is Status::NotSupported() 1194 // The file creation time is set using the env provided to the DB. 1195 // If the DB was created from a very old release then its possible that 1196 // the SST files might not have file_creation_time property and even after 1197 // moving to a newer release its possible that some files never got compacted 1198 // and may not have file_creation_time property. 
In both the cases 1199 // file_creation_time is considered 0 which means this API will return 1200 // creation_time = 0 as there wouldn't be a timestamp lower than 0. 1201 virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0; 1202 1203 // Note: this API is not yet consistent with WritePrepared transactions. 1204 // Sets iter to an iterator that is positioned at a write-batch containing 1205 // seq_number. If the sequence number is non existent, it returns an iterator 1206 // at the first available seq_no after the requested seq_no 1207 // Returns Status::OK if iterator is valid 1208 // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to 1209 // use this api, else the WAL files will get 1210 // cleared aggressively and the iterator might keep getting invalid before 1211 // an update is read. 1212 virtual Status GetUpdatesSince( 1213 SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter, 1214 const TransactionLogIterator::ReadOptions& read_options = 1215 TransactionLogIterator::ReadOptions()) = 0; 1216 1217 // Windows API macro interference 1218 #undef DeleteFile 1219 // Delete the file name from the db directory and update the internal state to 1220 // reflect that. Supports deletion of sst and log files only. 'name' must be 1221 // path relative to the db directory. eg. 000001.sst, /archive/000003.log 1222 virtual Status DeleteFile(std::string name) = 0; 1223 1224 // Returns a list of all table files with their level, start key 1225 // and end key GetLiveFilesMetaData(std::vector<LiveFileMetaData> *)1226 virtual void GetLiveFilesMetaData( 1227 std::vector<LiveFileMetaData>* /*metadata*/) {} 1228 1229 // Obtains the meta data of the specified column family of the DB. GetColumnFamilyMetaData(ColumnFamilyHandle *,ColumnFamilyMetaData *)1230 virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, 1231 ColumnFamilyMetaData* /*metadata*/) {} 1232 1233 // Get the metadata of the default column family. 
GetColumnFamilyMetaData(ColumnFamilyMetaData * metadata)1234 void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) { 1235 GetColumnFamilyMetaData(DefaultColumnFamily(), metadata); 1236 } 1237 1238 // IngestExternalFile() will load a list of external SST files (1) into the DB 1239 // Two primary modes are supported: 1240 // - Duplicate keys in the new files will overwrite exiting keys (default) 1241 // - Duplicate keys will be skipped (set ingest_behind=true) 1242 // In the first mode we will try to find the lowest possible level that 1243 // the file can fit in, and ingest the file into this level (2). A file that 1244 // have a key range that overlap with the memtable key range will require us 1245 // to Flush the memtable first before ingesting the file. 1246 // In the second mode we will always ingest in the bottom most level (see 1247 // docs to IngestExternalFileOptions::ingest_behind). 1248 // 1249 // (1) External SST files can be created using SstFileWriter 1250 // (2) We will try to ingest the files to the lowest possible level 1251 // even if the file compression doesn't match the level compression 1252 // (3) If IngestExternalFileOptions->ingest_behind is set to true, 1253 // we always ingest at the bottommost level, which should be reserved 1254 // for this purpose (see DBOPtions::allow_ingest_behind flag). 
1255 virtual Status IngestExternalFile( 1256 ColumnFamilyHandle* column_family, 1257 const std::vector<std::string>& external_files, 1258 const IngestExternalFileOptions& options) = 0; 1259 IngestExternalFile(const std::vector<std::string> & external_files,const IngestExternalFileOptions & options)1260 virtual Status IngestExternalFile( 1261 const std::vector<std::string>& external_files, 1262 const IngestExternalFileOptions& options) { 1263 return IngestExternalFile(DefaultColumnFamily(), external_files, options); 1264 } 1265 1266 // IngestExternalFiles() will ingest files for multiple column families, and 1267 // record the result atomically to the MANIFEST. 1268 // If this function returns OK, all column families' ingestion must succeed. 1269 // If this function returns NOK, or the process crashes, then non-of the 1270 // files will be ingested into the database after recovery. 1271 // Note that it is possible for application to observe a mixed state during 1272 // the execution of this function. If the user performs range scan over the 1273 // column families with iterators, iterator on one column family may return 1274 // ingested data, while iterator on other column family returns old data. 1275 // Users can use snapshot for a consistent view of data. 1276 // If your db ingests multiple SST files using this API, i.e. args.size() 1277 // > 1, then RocksDB 5.15 and earlier will not be able to open it. 1278 // 1279 // REQUIRES: each arg corresponds to a different column family: namely, for 1280 // 0 <= i < j < len(args), args[i].column_family != args[j].column_family. 1281 virtual Status IngestExternalFiles( 1282 const std::vector<IngestExternalFileArg>& args) = 0; 1283 1284 // CreateColumnFamilyWithImport() will create a new column family with 1285 // column_family_name and import external SST files specified in metadata into 1286 // this column family. 1287 // (1) External SST files can be created using SstFileWriter. 
1288 // (2) External SST files can be exported from a particular column family in 1289 // an existing DB. 1290 // Option in import_options specifies whether the external files are copied or 1291 // moved (default is copy). When option specifies copy, managing files at 1292 // external_file_path is caller's responsibility. When option specifies a 1293 // move, the call ensures that the specified files at external_file_path are 1294 // deleted on successful return and files are not modified on any error 1295 // return. 1296 // On error return, column family handle returned will be nullptr. 1297 // ColumnFamily will be present on successful return and will not be present 1298 // on error return. ColumnFamily may be present on any crash during this call. 1299 virtual Status CreateColumnFamilyWithImport( 1300 const ColumnFamilyOptions& options, const std::string& column_family_name, 1301 const ImportColumnFamilyOptions& import_options, 1302 const ExportImportFilesMetaData& metadata, 1303 ColumnFamilyHandle** handle) = 0; 1304 1305 virtual Status VerifyChecksum(const ReadOptions& read_options) = 0; 1306 VerifyChecksum()1307 virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } 1308 1309 // AddFile() is deprecated, please use IngestExternalFile() 1310 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( 1311 ColumnFamilyHandle* column_family, 1312 const std::vector<std::string>& file_path_list, bool move_file = false, 1313 bool skip_snapshot_check = false) { 1314 IngestExternalFileOptions ifo; 1315 ifo.move_files = move_file; 1316 ifo.snapshot_consistency = !skip_snapshot_check; 1317 ifo.allow_global_seqno = false; 1318 ifo.allow_blocking_flush = false; 1319 return IngestExternalFile(column_family, file_path_list, ifo); 1320 } 1321 1322 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( 1323 const std::vector<std::string>& file_path_list, bool move_file = false, 1324 bool skip_snapshot_check = false) { 1325 IngestExternalFileOptions ifo; 1326 ifo.move_files = 
move_file; 1327 ifo.snapshot_consistency = !skip_snapshot_check; 1328 ifo.allow_global_seqno = false; 1329 ifo.allow_blocking_flush = false; 1330 return IngestExternalFile(DefaultColumnFamily(), file_path_list, ifo); 1331 } 1332 1333 // AddFile() is deprecated, please use IngestExternalFile() 1334 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( 1335 ColumnFamilyHandle* column_family, const std::string& file_path, 1336 bool move_file = false, bool skip_snapshot_check = false) { 1337 IngestExternalFileOptions ifo; 1338 ifo.move_files = move_file; 1339 ifo.snapshot_consistency = !skip_snapshot_check; 1340 ifo.allow_global_seqno = false; 1341 ifo.allow_blocking_flush = false; 1342 return IngestExternalFile(column_family, {file_path}, ifo); 1343 } 1344 1345 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( 1346 const std::string& file_path, bool move_file = false, 1347 bool skip_snapshot_check = false) { 1348 IngestExternalFileOptions ifo; 1349 ifo.move_files = move_file; 1350 ifo.snapshot_consistency = !skip_snapshot_check; 1351 ifo.allow_global_seqno = false; 1352 ifo.allow_blocking_flush = false; 1353 return IngestExternalFile(DefaultColumnFamily(), {file_path}, ifo); 1354 } 1355 1356 // Load table file with information "file_info" into "column_family" 1357 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( 1358 ColumnFamilyHandle* column_family, 1359 const std::vector<ExternalSstFileInfo>& file_info_list, 1360 bool move_file = false, bool skip_snapshot_check = false) { 1361 std::vector<std::string> external_files; 1362 for (const ExternalSstFileInfo& file_info : file_info_list) { 1363 external_files.push_back(file_info.file_path); 1364 } 1365 IngestExternalFileOptions ifo; 1366 ifo.move_files = move_file; 1367 ifo.snapshot_consistency = !skip_snapshot_check; 1368 ifo.allow_global_seqno = false; 1369 ifo.allow_blocking_flush = false; 1370 return IngestExternalFile(column_family, external_files, ifo); 1371 } 1372 1373 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( 1374 
const std::vector<ExternalSstFileInfo>& file_info_list, 1375 bool move_file = false, bool skip_snapshot_check = false) { 1376 std::vector<std::string> external_files; 1377 for (const ExternalSstFileInfo& file_info : file_info_list) { 1378 external_files.push_back(file_info.file_path); 1379 } 1380 IngestExternalFileOptions ifo; 1381 ifo.move_files = move_file; 1382 ifo.snapshot_consistency = !skip_snapshot_check; 1383 ifo.allow_global_seqno = false; 1384 ifo.allow_blocking_flush = false; 1385 return IngestExternalFile(DefaultColumnFamily(), external_files, ifo); 1386 } 1387 1388 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( 1389 ColumnFamilyHandle* column_family, const ExternalSstFileInfo* file_info, 1390 bool move_file = false, bool skip_snapshot_check = false) { 1391 IngestExternalFileOptions ifo; 1392 ifo.move_files = move_file; 1393 ifo.snapshot_consistency = !skip_snapshot_check; 1394 ifo.allow_global_seqno = false; 1395 ifo.allow_blocking_flush = false; 1396 return IngestExternalFile(column_family, {file_info->file_path}, ifo); 1397 } 1398 1399 ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( 1400 const ExternalSstFileInfo* file_info, bool move_file = false, 1401 bool skip_snapshot_check = false) { 1402 IngestExternalFileOptions ifo; 1403 ifo.move_files = move_file; 1404 ifo.snapshot_consistency = !skip_snapshot_check; 1405 ifo.allow_global_seqno = false; 1406 ifo.allow_blocking_flush = false; 1407 return IngestExternalFile(DefaultColumnFamily(), {file_info->file_path}, 1408 ifo); 1409 } 1410 1411 #endif // ROCKSDB_LITE 1412 1413 // Returns the unique ID which is read from IDENTITY file during the opening 1414 // of database by setting in the identity variable 1415 // Returns Status::OK if identity could be set properly 1416 virtual Status GetDbIdentity(std::string& identity) const = 0; 1417 1418 // Returns default column family handle 1419 virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0; 1420 1421 #ifndef ROCKSDB_LITE 1422 virtual Status 
GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, 1423 TablePropertiesCollection* props) = 0; GetPropertiesOfAllTables(TablePropertiesCollection * props)1424 virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { 1425 return GetPropertiesOfAllTables(DefaultColumnFamily(), props); 1426 } 1427 virtual Status GetPropertiesOfTablesInRange( 1428 ColumnFamilyHandle* column_family, const Range* range, std::size_t n, 1429 TablePropertiesCollection* props) = 0; 1430 SuggestCompactRange(ColumnFamilyHandle *,const Slice *,const Slice *)1431 virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/, 1432 const Slice* /*begin*/, 1433 const Slice* /*end*/) { 1434 return Status::NotSupported("SuggestCompactRange() is not implemented."); 1435 } 1436 PromoteL0(ColumnFamilyHandle *,int)1437 virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/, 1438 int /*target_level*/) { 1439 return Status::NotSupported("PromoteL0() is not implemented."); 1440 } 1441 1442 // Trace DB operations. Use EndTrace() to stop tracing. StartTrace(const TraceOptions &,std::unique_ptr<TraceWriter> &&)1443 virtual Status StartTrace(const TraceOptions& /*options*/, 1444 std::unique_ptr<TraceWriter>&& /*trace_writer*/) { 1445 return Status::NotSupported("StartTrace() is not implemented."); 1446 } 1447 EndTrace()1448 virtual Status EndTrace() { 1449 return Status::NotSupported("EndTrace() is not implemented."); 1450 } 1451 1452 // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing. 
StartBlockCacheTrace(const TraceOptions &,std::unique_ptr<TraceWriter> &&)1453 virtual Status StartBlockCacheTrace( 1454 const TraceOptions& /*options*/, 1455 std::unique_ptr<TraceWriter>&& /*trace_writer*/) { 1456 return Status::NotSupported("StartBlockCacheTrace() is not implemented."); 1457 } 1458 EndBlockCacheTrace()1459 virtual Status EndBlockCacheTrace() { 1460 return Status::NotSupported("EndBlockCacheTrace() is not implemented."); 1461 } 1462 #endif // ROCKSDB_LITE 1463 1464 // Needed for StackableDB GetRootDB()1465 virtual DB* GetRootDB() { return this; } 1466 1467 // Given a window [start_time, end_time), setup a StatsHistoryIterator 1468 // to access stats history. Note the start_time and end_time are epoch 1469 // time measured in seconds, and end_time is an exclusive bound. GetStatsHistory(uint64_t,uint64_t,std::unique_ptr<StatsHistoryIterator> *)1470 virtual Status GetStatsHistory( 1471 uint64_t /*start_time*/, uint64_t /*end_time*/, 1472 std::unique_ptr<StatsHistoryIterator>* /*stats_iterator*/) { 1473 return Status::NotSupported("GetStatsHistory() is not implemented."); 1474 } 1475 1476 #ifndef ROCKSDB_LITE 1477 // Make the secondary instance catch up with the primary by tailing and 1478 // replaying the MANIFEST and WAL of the primary. 1479 // Column families created by the primary after the secondary instance starts 1480 // will be ignored unless the secondary instance closes and restarts with the 1481 // newly created column families. 1482 // Column families that exist before secondary instance starts and dropped by 1483 // the primary afterwards will be marked as dropped. However, as long as the 1484 // secondary instance does not delete the corresponding column family 1485 // handles, the data of the column family is still accessible to the 1486 // secondary. 1487 // TODO: we will support WAL tailing soon. 
TryCatchUpWithPrimary()1488 virtual Status TryCatchUpWithPrimary() { 1489 return Status::NotSupported("Supported only by secondary instance"); 1490 } 1491 #endif // !ROCKSDB_LITE 1492 }; 1493 1494 // Destroy the contents of the specified database. 1495 // Be very careful using this method. 1496 Status DestroyDB(const std::string& name, const Options& options, 1497 const std::vector<ColumnFamilyDescriptor>& column_families = 1498 std::vector<ColumnFamilyDescriptor>()); 1499 1500 #ifndef ROCKSDB_LITE 1501 // If a DB cannot be opened, you may attempt to call this method to 1502 // resurrect as much of the contents of the database as possible. 1503 // Some data may be lost, so be careful when calling this function 1504 // on a database that contains important information. 1505 // 1506 // With this API, we will warn and skip data associated with column families not 1507 // specified in column_families. 1508 // 1509 // @param column_families Descriptors for known column families 1510 Status RepairDB(const std::string& dbname, const DBOptions& db_options, 1511 const std::vector<ColumnFamilyDescriptor>& column_families); 1512 1513 // @param unknown_cf_opts Options for column families encountered during the 1514 // repair that were not specified in column_families. 1515 Status RepairDB(const std::string& dbname, const DBOptions& db_options, 1516 const std::vector<ColumnFamilyDescriptor>& column_families, 1517 const ColumnFamilyOptions& unknown_cf_opts); 1518 1519 // @param options These options will be used for the database and for ALL column 1520 // families encountered during the repair 1521 Status RepairDB(const std::string& dbname, const Options& options); 1522 1523 #endif 1524 1525 } // namespace ROCKSDB_NAMESPACE 1526