1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 2 // This source code is licensed under both the GPLv2 (found in the 3 // COPYING file in the root directory) and Apache 2.0 License 4 // (found in the LICENSE.Apache file in the root directory). 5 6 #pragma once 7 #ifndef ROCKSDB_LITE 8 9 #include <string> 10 #include <utility> 11 #include <vector> 12 13 #include "rocksdb/comparator.h" 14 #include "rocksdb/db.h" 15 #include "rocksdb/utilities/stackable_db.h" 16 #include "rocksdb/utilities/transaction.h" 17 18 // Database with Transaction support. 19 // 20 // See transaction.h and examples/transaction_example.cc 21 22 namespace ROCKSDB_NAMESPACE { 23 24 class TransactionDBMutexFactory; 25 26 enum TxnDBWritePolicy { 27 WRITE_COMMITTED = 0, // write only the committed data 28 WRITE_PREPARED, // write data after the prepare phase of 2pc 29 WRITE_UNPREPARED // write data before the prepare phase of 2pc 30 }; 31 32 const uint32_t kInitialMaxDeadlocks = 5; 33 34 class LockManager; 35 struct RangeLockInfo; 36 37 // A lock manager handle 38 // The workflow is as follows: 39 // * Use a factory method (like NewRangeLockManager()) to create a lock 40 // manager and get its handle. 41 // * A Handle for a particular kind of lock manager will have extra 42 // methods and parameters to control the lock manager 43 // * Pass the handle to RocksDB in TransactionDBOptions::lock_mgr_handle. It 44 // will be used to perform locking. 45 class LockManagerHandle { 46 public: 47 // PessimisticTransactionDB will call this to get the Lock Manager it's going 48 // to use. 49 virtual LockManager* getLockManager() = 0; 50 ~LockManagerHandle()51 virtual ~LockManagerHandle() {} 52 }; 53 54 // Same as class Endpoint, but use std::string to manage the buffer allocation 55 struct EndpointWithString { 56 std::string slice; 57 bool inf_suffix; 58 }; 59 60 struct RangeDeadlockInfo { 61 TransactionID m_txn_id; 62 uint32_t m_cf_id; 63 bool m_exclusive; 64 65 EndpointWithString m_start; 66 EndpointWithString m_end; 67 }; 68 69 struct RangeDeadlockPath { 70 std::vector<RangeDeadlockInfo> path; 71 bool limit_exceeded; 72 int64_t deadlock_time; 73 RangeDeadlockPathRangeDeadlockPath74 explicit RangeDeadlockPath(std::vector<RangeDeadlockInfo> path_entry, 75 const int64_t& dl_time) 76 : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} 77 78 // empty path, limit exceeded constructor and default constructor 79 explicit RangeDeadlockPath(const int64_t& dl_time = 0, bool limit = false) 80 : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} 81 emptyRangeDeadlockPath82 bool empty() { return path.empty() && !limit_exceeded; } 83 }; 84 85 // A handle to control RangeLockManager (Range-based lock manager) from outside 86 // RocksDB 87 class RangeLockManagerHandle : public LockManagerHandle { 88 public: 89 // Set total amount of lock memory to use. 90 // 91 // @return 0 Ok 92 // @return EDOM Failed to set because currently using more memory than 93 // specified 94 virtual int SetMaxLockMemory(size_t max_lock_memory) = 0; 95 virtual size_t GetMaxLockMemory() = 0; 96 97 using RangeLockStatus = 98 std::unordered_multimap<ColumnFamilyId, RangeLockInfo>; 99 100 virtual RangeLockStatus GetRangeLockStatusData() = 0; 101 102 class Counters { 103 public: 104 // Number of times lock escalation was triggered (for all column families) 105 uint64_t escalation_count; 106 107 // How much memory is currently used for locks (total for all column 108 // families) 109 uint64_t current_lock_memory; 110 }; 111 112 // Get the current counter values 113 virtual Counters GetStatus() = 0; 114 115 // Functions for range-based Deadlock reporting. 116 virtual std::vector<RangeDeadlockPath> GetRangeDeadlockInfoBuffer() = 0; 117 virtual void SetRangeDeadlockInfoBufferSize(uint32_t target_size) = 0; 118 ~RangeLockManagerHandle()119 virtual ~RangeLockManagerHandle() {} 120 }; 121 122 // A factory function to create a Range Lock Manager. The created object should 123 // be: 124 // 1. Passed in TransactionDBOptions::lock_mgr_handle to open the database in 125 // range-locking mode 126 // 2. Used to control the lock manager when the DB is already open. 127 RangeLockManagerHandle* NewRangeLockManager( 128 std::shared_ptr<TransactionDBMutexFactory> mutex_factory); 129 130 struct TransactionDBOptions { 131 // Specifies the maximum number of keys that can be locked at the same time 132 // per column family. 133 // If the number of locked keys is greater than max_num_locks, transaction 134 // writes (or GetForUpdate) will return an error. 135 // If this value is not positive, no limit will be enforced. 136 int64_t max_num_locks = -1; 137 138 // Stores the number of latest deadlocks to track 139 uint32_t max_num_deadlocks = kInitialMaxDeadlocks; 140 141 // Increasing this value will increase the concurrency by dividing the lock 142 // table (per column family) into more sub-tables, each with their own 143 // separate 144 // mutex. 145 size_t num_stripes = 16; 146 147 // If positive, specifies the default wait timeout in milliseconds when 148 // a transaction attempts to lock a key if not specified by 149 // TransactionOptions::lock_timeout. 150 // 151 // If 0, no waiting is done if a lock cannot instantly be acquired. 152 // If negative, there is no timeout. Not using a timeout is not recommended 153 // as it can lead to deadlocks. Currently, there is no deadlock-detection to 154 // recover 155 // from a deadlock. 156 int64_t transaction_lock_timeout = 1000; // 1 second 157 158 // If positive, specifies the wait timeout in milliseconds when writing a key 159 // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write() 160 // directly). 161 // If 0, no waiting is done if a lock cannot instantly be acquired. 162 // If negative, there is no timeout and will block indefinitely when acquiring 163 // a lock. 164 // 165 // Not using a timeout can lead to deadlocks. Currently, there 166 // is no deadlock-detection to recover from a deadlock. While DB writes 167 // cannot deadlock with other DB writes, they can deadlock with a transaction. 168 // A negative timeout should only be used if all transactions have a small 169 // expiration set. 170 int64_t default_lock_timeout = 1000; // 1 second 171 172 // If set, the TransactionDB will use this implementation of a mutex and 173 // condition variable for all transaction locking instead of the default 174 // mutex/condvar implementation. 175 std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory; 176 177 // The policy for when to write the data into the DB. The default policy is to 178 // write only the committed data (WRITE_COMMITTED). The data could be written 179 // before the commit phase. The DB then needs to provide the mechanisms to 180 // tell apart committed from uncommitted data. 181 TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED; 182 183 // TODO(myabandeh): remove this option 184 // Note: this is a temporary option as a hot fix in rollback of writeprepared 185 // txns in myrocks. MyRocks uses merge operands for autoinc column id without 186 // however obtaining locks. This breaks the assumption behind the rollback 187 // logic in myrocks. This hack of simply not rolling back merge operands works 188 // for the special way that myrocks uses this operands. 189 bool rollback_merge_operands = false; 190 191 // nullptr means use default lock manager. 192 // Other value means the user provides a custom lock manager. 193 std::shared_ptr<LockManagerHandle> lock_mgr_handle; 194 195 // If true, the TransactionDB implementation might skip concurrency control 196 // unless it is overridden by TransactionOptions or 197 // TransactionDBWriteOptimizations. This can be used in conjunction with 198 // DBOptions::unordered_write when the TransactionDB is used solely for write 199 // ordering rather than concurrency control. 200 bool skip_concurrency_control = false; 201 202 // This option is only valid for write unprepared. If a write batch exceeds 203 // this threshold, then the transaction will implicitly flush the currently 204 // pending writes into the database. A value of 0 or less means no limit. 205 int64_t default_write_batch_flush_threshold = 0; 206 207 private: 208 // 128 entries 209 size_t wp_snapshot_cache_bits = static_cast<size_t>(7); 210 // 8m entry, 64MB size 211 size_t wp_commit_cache_bits = static_cast<size_t>(23); 212 213 // For testing, whether transaction name should be auto-generated or not. This 214 // is useful for write unprepared which requires named transactions. 215 bool autogenerate_name = false; 216 217 friend class WritePreparedTxnDB; 218 friend class WriteUnpreparedTxn; 219 friend class WritePreparedTransactionTestBase; 220 friend class TransactionTestBase; 221 friend class MySQLStyleTransactionTest; 222 }; 223 224 struct TransactionOptions { 225 // Setting set_snapshot=true is the same as calling 226 // Transaction::SetSnapshot(). 227 bool set_snapshot = false; 228 229 // Setting to true means that before acquiring locks, this transaction will 230 // check if doing so will cause a deadlock. If so, it will return with 231 // Status::Busy. The user should retry their transaction. 232 bool deadlock_detect = false; 233 234 // If set, it states that the CommitTimeWriteBatch represents the latest state 235 // of the application, has only one sub-batch, i.e., no duplicate keys, and 236 // meant to be used later during recovery. It enables an optimization to 237 // postpone updating the memtable with CommitTimeWriteBatch to only 238 // SwitchMemtable or recovery. 239 bool use_only_the_last_commit_time_batch_for_recovery = false; 240 241 // TODO(agiardullo): TransactionDB does not yet support comparators that allow 242 // two non-equal keys to be equivalent. Ie, cmp->Compare(a,b) should only 243 // return 0 if 244 // a.compare(b) returns 0. 245 246 // If positive, specifies the wait timeout in milliseconds when 247 // a transaction attempts to lock a key. 248 // 249 // If 0, no waiting is done if a lock cannot instantly be acquired. 250 // If negative, TransactionDBOptions::transaction_lock_timeout will be used. 251 int64_t lock_timeout = -1; 252 253 // Expiration duration in milliseconds. If non-negative, transactions that 254 // last longer than this many milliseconds will fail to commit. If not set, 255 // a forgotten transaction that is never committed, rolled back, or deleted 256 // will never relinquish any locks it holds. This could prevent keys from 257 // being written by other writers. 258 int64_t expiration = -1; 259 260 // The number of traversals to make during deadlock detection. 261 int64_t deadlock_detect_depth = 50; 262 263 // The maximum number of bytes used for the write batch. 0 means no limit. 264 size_t max_write_batch_size = 0; 265 266 // Skip Concurrency Control. This could be as an optimization if the 267 // application knows that the transaction would not have any conflict with 268 // concurrent transactions. It could also be used during recovery if (i) 269 // application guarantees no conflict between prepared transactions in the WAL 270 // (ii) application guarantees that recovered transactions will be rolled 271 // back/commit before new transactions start. 272 // Default: false 273 bool skip_concurrency_control = false; 274 275 // In pessimistic transaction, if this is true, then you can skip Prepare 276 // before Commit, otherwise, you must Prepare before Commit. 277 bool skip_prepare = true; 278 279 // See TransactionDBOptions::default_write_batch_flush_threshold for 280 // description. If a negative value is specified, then the default value from 281 // TransactionDBOptions is used. 282 int64_t write_batch_flush_threshold = -1; 283 }; 284 285 // The per-write optimizations that do not involve transactions. TransactionDB 286 // implementation might or might not make use of the specified optimizations. 287 struct TransactionDBWriteOptimizations { 288 // If it is true it means that the application guarantees that the 289 // key-set in the write batch do not conflict with any concurrent transaction 290 // and hence the concurrency control mechanism could be skipped for this 291 // write. 292 bool skip_concurrency_control = false; 293 // If true, the application guarantees that there is no duplicate <column 294 // family, key> in the write batch and any employed mechanism to handle 295 // duplicate keys could be skipped. 296 bool skip_duplicate_key_check = false; 297 }; 298 299 struct KeyLockInfo { 300 std::string key; 301 std::vector<TransactionID> ids; 302 bool exclusive; 303 }; 304 305 struct RangeLockInfo { 306 EndpointWithString start; 307 EndpointWithString end; 308 std::vector<TransactionID> ids; 309 bool exclusive; 310 }; 311 312 struct DeadlockInfo { 313 TransactionID m_txn_id; 314 uint32_t m_cf_id; 315 bool m_exclusive; 316 std::string m_waiting_key; 317 }; 318 319 struct DeadlockPath { 320 std::vector<DeadlockInfo> path; 321 bool limit_exceeded; 322 int64_t deadlock_time; 323 DeadlockPathDeadlockPath324 explicit DeadlockPath(std::vector<DeadlockInfo> path_entry, 325 const int64_t& dl_time) 326 : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} 327 328 // empty path, limit exceeded constructor and default constructor 329 explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false) 330 : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} 331 emptyDeadlockPath332 bool empty() { return path.empty() && !limit_exceeded; } 333 }; 334 335 class TransactionDB : public StackableDB { 336 public: 337 // Optimized version of ::Write that receives more optimization request such 338 // as skip_concurrency_control. 339 using StackableDB::Write; Write(const WriteOptions & opts,const TransactionDBWriteOptimizations &,WriteBatch * updates)340 virtual Status Write(const WriteOptions& opts, 341 const TransactionDBWriteOptimizations&, 342 WriteBatch* updates) { 343 // The default implementation ignores TransactionDBWriteOptimizations and 344 // falls back to the un-optimized version of ::Write 345 return Write(opts, updates); 346 } 347 // Transactional `DeleteRange()` is not yet supported. 348 // However, users who know their deleted range does not conflict with 349 // anything can still use it via the `Write()` API. In all cases, the 350 // `Write()` overload specifying `TransactionDBWriteOptimizations` must be 351 // used and `skip_concurrency_control` must be set. When using either 352 // WRITE_PREPARED or WRITE_UNPREPARED , `skip_duplicate_key_check` must 353 // additionally be set. DeleteRange(const WriteOptions &,ColumnFamilyHandle *,const Slice &,const Slice &)354 virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, 355 const Slice&, const Slice&) override { 356 return Status::NotSupported(); 357 } 358 // Open a TransactionDB similar to DB::Open(). 359 // Internally call PrepareWrap() and WrapDB() 360 // If the return status is not ok, then dbptr is set to nullptr. 361 static Status Open(const Options& options, 362 const TransactionDBOptions& txn_db_options, 363 const std::string& dbname, TransactionDB** dbptr); 364 365 static Status Open(const DBOptions& db_options, 366 const TransactionDBOptions& txn_db_options, 367 const std::string& dbname, 368 const std::vector<ColumnFamilyDescriptor>& column_families, 369 std::vector<ColumnFamilyHandle*>* handles, 370 TransactionDB** dbptr); 371 // Note: PrepareWrap() may change parameters, make copies before the 372 // invocation if needed. 373 static void PrepareWrap(DBOptions* db_options, 374 std::vector<ColumnFamilyDescriptor>* column_families, 375 std::vector<size_t>* compaction_enabled_cf_indices); 376 // If the return status is not ok, then dbptr will bet set to nullptr. The 377 // input db parameter might or might not be deleted as a result of the 378 // failure. If it is properly deleted it will be set to nullptr. If the return 379 // status is ok, the ownership of db is transferred to dbptr. 380 static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options, 381 const std::vector<size_t>& compaction_enabled_cf_indices, 382 const std::vector<ColumnFamilyHandle*>& handles, 383 TransactionDB** dbptr); 384 // If the return status is not ok, then dbptr will bet set to nullptr. The 385 // input db parameter might or might not be deleted as a result of the 386 // failure. If it is properly deleted it will be set to nullptr. If the return 387 // status is ok, the ownership of db is transferred to dbptr. 388 static Status WrapStackableDB( 389 StackableDB* db, const TransactionDBOptions& txn_db_options, 390 const std::vector<size_t>& compaction_enabled_cf_indices, 391 const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr); 392 // Since the destructor in StackableDB is virtual, this destructor is virtual 393 // too. The root db will be deleted by the base's destructor. ~TransactionDB()394 ~TransactionDB() override {} 395 396 // Starts a new Transaction. 397 // 398 // Caller is responsible for deleting the returned transaction when no 399 // longer needed. 400 // 401 // If old_txn is not null, BeginTransaction will reuse this Transaction 402 // handle instead of allocating a new one. This is an optimization to avoid 403 // extra allocations when repeatedly creating transactions. 404 virtual Transaction* BeginTransaction( 405 const WriteOptions& write_options, 406 const TransactionOptions& txn_options = TransactionOptions(), 407 Transaction* old_txn = nullptr) = 0; 408 409 virtual Transaction* GetTransactionByName(const TransactionName& name) = 0; 410 virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0; 411 412 // Returns set of all locks held. 413 // 414 // The mapping is column family id -> KeyLockInfo 415 virtual std::unordered_multimap<uint32_t, KeyLockInfo> 416 GetLockStatusData() = 0; 417 418 virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0; 419 virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0; 420 421 protected: 422 // To Create an TransactionDB, call Open() 423 // The ownership of db is transferred to the base StackableDB TransactionDB(DB * db)424 explicit TransactionDB(DB* db) : StackableDB(db) {} 425 // No copying allowed 426 TransactionDB(const TransactionDB&) = delete; 427 void operator=(const TransactionDB&) = delete; 428 }; 429 430 } // namespace ROCKSDB_NAMESPACE 431 432 #endif // ROCKSDB_LITE 433