1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 2 // This source code is licensed under both the GPLv2 (found in the 3 // COPYING file in the root directory) and Apache 2.0 License 4 // (found in the LICENSE.Apache file in the root directory). 5 6 #pragma once 7 #ifndef ROCKSDB_LITE 8 9 #include <string> 10 #include <utility> 11 #include <vector> 12 13 #include "rocksdb/comparator.h" 14 #include "rocksdb/db.h" 15 #include "rocksdb/utilities/stackable_db.h" 16 #include "rocksdb/utilities/transaction.h" 17 18 // Database with Transaction support. 19 // 20 // See transaction.h and examples/transaction_example.cc 21 22 namespace ROCKSDB_NAMESPACE { 23 24 class TransactionDBMutexFactory; 25 26 enum TxnDBWritePolicy { 27 WRITE_COMMITTED = 0, // write only the committed data 28 WRITE_PREPARED, // write data after the prepare phase of 2pc 29 WRITE_UNPREPARED // write data before the prepare phase of 2pc 30 }; 31 32 const uint32_t kInitialMaxDeadlocks = 5; 33 34 struct TransactionDBOptions { 35 // Specifies the maximum number of keys that can be locked at the same time 36 // per column family. 37 // If the number of locked keys is greater than max_num_locks, transaction 38 // writes (or GetForUpdate) will return an error. 39 // If this value is not positive, no limit will be enforced. 40 int64_t max_num_locks = -1; 41 42 // Stores the number of latest deadlocks to track 43 uint32_t max_num_deadlocks = kInitialMaxDeadlocks; 44 45 // Increasing this value will increase the concurrency by dividing the lock 46 // table (per column family) into more sub-tables, each with their own 47 // separate 48 // mutex. 49 size_t num_stripes = 16; 50 51 // If positive, specifies the default wait timeout in milliseconds when 52 // a transaction attempts to lock a key if not specified by 53 // TransactionOptions::lock_timeout. 54 // 55 // If 0, no waiting is done if a lock cannot instantly be acquired. 56 // If negative, there is no timeout. Not using a timeout is not recommended 57 // as it can lead to deadlocks. Currently, there is no deadlock-detection to 58 // recover 59 // from a deadlock. 60 int64_t transaction_lock_timeout = 1000; // 1 second 61 62 // If positive, specifies the wait timeout in milliseconds when writing a key 63 // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write() 64 // directly). 65 // If 0, no waiting is done if a lock cannot instantly be acquired. 66 // If negative, there is no timeout and will block indefinitely when acquiring 67 // a lock. 68 // 69 // Not using a timeout can lead to deadlocks. Currently, there 70 // is no deadlock-detection to recover from a deadlock. While DB writes 71 // cannot deadlock with other DB writes, they can deadlock with a transaction. 72 // A negative timeout should only be used if all transactions have a small 73 // expiration set. 74 int64_t default_lock_timeout = 1000; // 1 second 75 76 // If set, the TransactionDB will use this implementation of a mutex and 77 // condition variable for all transaction locking instead of the default 78 // mutex/condvar implementation. 79 std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory; 80 81 // The policy for when to write the data into the DB. The default policy is to 82 // write only the committed data (WRITE_COMMITTED). The data could be written 83 // before the commit phase. The DB then needs to provide the mechanisms to 84 // tell apart committed from uncommitted data. 85 TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED; 86 87 // TODO(myabandeh): remove this option 88 // Note: this is a temporary option as a hot fix in rollback of writeprepared 89 // txns in myrocks. MyRocks uses merge operands for autoinc column id without 90 // however obtaining locks. This breaks the assumption behind the rollback 91 // logic in myrocks. This hack of simply not rolling back merge operands works 92 // for the special way that myrocks uses this operands. 93 bool rollback_merge_operands = false; 94 95 // If true, the TransactionDB implementation might skip concurrency control 96 // unless it is overridden by TransactionOptions or 97 // TransactionDBWriteOptimizations. This can be used in conjuction with 98 // DBOptions::unordered_write when the TransactionDB is used solely for write 99 // ordering rather than concurrency control. 100 bool skip_concurrency_control = false; 101 102 // This option is only valid for write unprepared. If a write batch exceeds 103 // this threshold, then the transaction will implicitly flush the currently 104 // pending writes into the database. A value of 0 or less means no limit. 105 int64_t default_write_batch_flush_threshold = 0; 106 107 private: 108 // 128 entries 109 size_t wp_snapshot_cache_bits = static_cast<size_t>(7); 110 // 8m entry, 64MB size 111 size_t wp_commit_cache_bits = static_cast<size_t>(23); 112 113 // For testing, whether transaction name should be auto-generated or not. This 114 // is useful for write unprepared which requires named transactions. 115 bool autogenerate_name = false; 116 117 friend class WritePreparedTxnDB; 118 friend class WriteUnpreparedTxn; 119 friend class WritePreparedTransactionTestBase; 120 friend class TransactionTestBase; 121 friend class MySQLStyleTransactionTest; 122 }; 123 124 struct TransactionOptions { 125 // Setting set_snapshot=true is the same as calling 126 // Transaction::SetSnapshot(). 127 bool set_snapshot = false; 128 129 // Setting to true means that before acquiring locks, this transaction will 130 // check if doing so will cause a deadlock. If so, it will return with 131 // Status::Busy. The user should retry their transaction. 132 bool deadlock_detect = false; 133 134 // If set, it states that the CommitTimeWriteBatch represents the latest state 135 // of the application, has only one sub-batch, i.e., no duplicate keys, and 136 // meant to be used later during recovery. It enables an optimization to 137 // postpone updating the memtable with CommitTimeWriteBatch to only 138 // SwitchMemtable or recovery. 139 bool use_only_the_last_commit_time_batch_for_recovery = false; 140 141 // TODO(agiardullo): TransactionDB does not yet support comparators that allow 142 // two non-equal keys to be equivalent. Ie, cmp->Compare(a,b) should only 143 // return 0 if 144 // a.compare(b) returns 0. 145 146 // If positive, specifies the wait timeout in milliseconds when 147 // a transaction attempts to lock a key. 148 // 149 // If 0, no waiting is done if a lock cannot instantly be acquired. 150 // If negative, TransactionDBOptions::transaction_lock_timeout will be used. 151 int64_t lock_timeout = -1; 152 153 // Expiration duration in milliseconds. If non-negative, transactions that 154 // last longer than this many milliseconds will fail to commit. If not set, 155 // a forgotten transaction that is never committed, rolled back, or deleted 156 // will never relinquish any locks it holds. This could prevent keys from 157 // being written by other writers. 158 int64_t expiration = -1; 159 160 // The number of traversals to make during deadlock detection. 161 int64_t deadlock_detect_depth = 50; 162 163 // The maximum number of bytes used for the write batch. 0 means no limit. 164 size_t max_write_batch_size = 0; 165 166 // Skip Concurrency Control. This could be as an optimization if the 167 // application knows that the transaction would not have any conflict with 168 // concurrent transactions. It could also be used during recovery if (i) 169 // application guarantees no conflict between prepared transactions in the WAL 170 // (ii) application guarantees that recovered transactions will be rolled 171 // back/commit before new transactions start. 172 // Default: false 173 bool skip_concurrency_control = false; 174 175 // See TransactionDBOptions::default_write_batch_flush_threshold for 176 // description. If a negative value is specified, then the default value from 177 // TransactionDBOptions is used. 178 int64_t write_batch_flush_threshold = -1; 179 }; 180 181 // The per-write optimizations that do not involve transactions. TransactionDB 182 // implementation might or might not make use of the specified optimizations. 183 struct TransactionDBWriteOptimizations { 184 // If it is true it means that the application guarantees that the 185 // key-set in the write batch do not conflict with any concurrent transaction 186 // and hence the concurrency control mechanism could be skipped for this 187 // write. 188 bool skip_concurrency_control = false; 189 // If true, the application guarantees that there is no duplicate <column 190 // family, key> in the write batch and any employed mechanism to handle 191 // duplicate keys could be skipped. 192 bool skip_duplicate_key_check = false; 193 }; 194 195 struct KeyLockInfo { 196 std::string key; 197 std::vector<TransactionID> ids; 198 bool exclusive; 199 }; 200 201 struct DeadlockInfo { 202 TransactionID m_txn_id; 203 uint32_t m_cf_id; 204 bool m_exclusive; 205 std::string m_waiting_key; 206 }; 207 208 struct DeadlockPath { 209 std::vector<DeadlockInfo> path; 210 bool limit_exceeded; 211 int64_t deadlock_time; 212 DeadlockPathDeadlockPath213 explicit DeadlockPath(std::vector<DeadlockInfo> path_entry, 214 const int64_t& dl_time) 215 : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} 216 217 // empty path, limit exceeded constructor and default constructor 218 explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false) 219 : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} 220 emptyDeadlockPath221 bool empty() { return path.empty() && !limit_exceeded; } 222 }; 223 224 class TransactionDB : public StackableDB { 225 public: 226 // Optimized version of ::Write that receives more optimization request such 227 // as skip_concurrency_control. 228 using StackableDB::Write; Write(const WriteOptions & opts,const TransactionDBWriteOptimizations &,WriteBatch * updates)229 virtual Status Write(const WriteOptions& opts, 230 const TransactionDBWriteOptimizations&, 231 WriteBatch* updates) { 232 // The default implementation ignores TransactionDBWriteOptimizations and 233 // falls back to the un-optimized version of ::Write 234 return Write(opts, updates); 235 } 236 // Open a TransactionDB similar to DB::Open(). 237 // Internally call PrepareWrap() and WrapDB() 238 // If the return status is not ok, then dbptr is set to nullptr. 239 static Status Open(const Options& options, 240 const TransactionDBOptions& txn_db_options, 241 const std::string& dbname, TransactionDB** dbptr); 242 243 static Status Open(const DBOptions& db_options, 244 const TransactionDBOptions& txn_db_options, 245 const std::string& dbname, 246 const std::vector<ColumnFamilyDescriptor>& column_families, 247 std::vector<ColumnFamilyHandle*>* handles, 248 TransactionDB** dbptr); 249 // Note: PrepareWrap() may change parameters, make copies before the 250 // invocation if needed. 251 static void PrepareWrap(DBOptions* db_options, 252 std::vector<ColumnFamilyDescriptor>* column_families, 253 std::vector<size_t>* compaction_enabled_cf_indices); 254 // If the return status is not ok, then dbptr will bet set to nullptr. The 255 // input db parameter might or might not be deleted as a result of the 256 // failure. If it is properly deleted it will be set to nullptr. If the return 257 // status is ok, the ownership of db is transferred to dbptr. 258 static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options, 259 const std::vector<size_t>& compaction_enabled_cf_indices, 260 const std::vector<ColumnFamilyHandle*>& handles, 261 TransactionDB** dbptr); 262 // If the return status is not ok, then dbptr will bet set to nullptr. The 263 // input db parameter might or might not be deleted as a result of the 264 // failure. If it is properly deleted it will be set to nullptr. If the return 265 // status is ok, the ownership of db is transferred to dbptr. 266 static Status WrapStackableDB( 267 StackableDB* db, const TransactionDBOptions& txn_db_options, 268 const std::vector<size_t>& compaction_enabled_cf_indices, 269 const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr); 270 // Since the destructor in StackableDB is virtual, this destructor is virtual 271 // too. The root db will be deleted by the base's destructor. ~TransactionDB()272 ~TransactionDB() override {} 273 274 // Starts a new Transaction. 275 // 276 // Caller is responsible for deleting the returned transaction when no 277 // longer needed. 278 // 279 // If old_txn is not null, BeginTransaction will reuse this Transaction 280 // handle instead of allocating a new one. This is an optimization to avoid 281 // extra allocations when repeatedly creating transactions. 282 virtual Transaction* BeginTransaction( 283 const WriteOptions& write_options, 284 const TransactionOptions& txn_options = TransactionOptions(), 285 Transaction* old_txn = nullptr) = 0; 286 287 virtual Transaction* GetTransactionByName(const TransactionName& name) = 0; 288 virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0; 289 290 // Returns set of all locks held. 291 // 292 // The mapping is column family id -> KeyLockInfo 293 virtual std::unordered_multimap<uint32_t, KeyLockInfo> 294 GetLockStatusData() = 0; 295 virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0; 296 virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0; 297 298 protected: 299 // To Create an TransactionDB, call Open() 300 // The ownership of db is transferred to the base StackableDB TransactionDB(DB * db)301 explicit TransactionDB(DB* db) : StackableDB(db) {} 302 // No copying allowed 303 TransactionDB(const TransactionDB&) = delete; 304 void operator=(const TransactionDB&) = delete; 305 }; 306 307 } // namespace ROCKSDB_NAMESPACE 308 309 #endif // ROCKSDB_LITE 310