1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 
6 #pragma once
7 #ifndef ROCKSDB_LITE
8 
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/utilities/stackable_db.h"
#include "rocksdb/utilities/transaction.h"
17 
18 // Database with Transaction support.
19 //
20 // See transaction.h and examples/transaction_example.cc
21 
22 namespace ROCKSDB_NAMESPACE {
23 
24 class TransactionDBMutexFactory;
25 
// Selects at which point of a transaction's lifetime its data is written.
enum TxnDBWritePolicy {
  // Write only the committed data.
  WRITE_COMMITTED = 0,
  // Write data after the prepare phase of 2pc.
  WRITE_PREPARED = 1,
  // Write data before the prepare phase of 2pc.
  WRITE_UNPREPARED = 2
};
31 
// Initial value of TransactionDBOptions::max_num_deadlocks.
constexpr uint32_t kInitialMaxDeadlocks = 5;
33 
34 struct TransactionDBOptions {
35   // Specifies the maximum number of keys that can be locked at the same time
36   // per column family.
37   // If the number of locked keys is greater than max_num_locks, transaction
38   // writes (or GetForUpdate) will return an error.
39   // If this value is not positive, no limit will be enforced.
40   int64_t max_num_locks = -1;
41 
42   // Stores the number of latest deadlocks to track
43   uint32_t max_num_deadlocks = kInitialMaxDeadlocks;
44 
45   // Increasing this value will increase the concurrency by dividing the lock
46   // table (per column family) into more sub-tables, each with their own
47   // separate
48   // mutex.
49   size_t num_stripes = 16;
50 
51   // If positive, specifies the default wait timeout in milliseconds when
52   // a transaction attempts to lock a key if not specified by
53   // TransactionOptions::lock_timeout.
54   //
55   // If 0, no waiting is done if a lock cannot instantly be acquired.
56   // If negative, there is no timeout.  Not using a timeout is not recommended
57   // as it can lead to deadlocks.  Currently, there is no deadlock-detection to
58   // recover
59   // from a deadlock.
60   int64_t transaction_lock_timeout = 1000;  // 1 second
61 
62   // If positive, specifies the wait timeout in milliseconds when writing a key
63   // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write()
64   // directly).
65   // If 0, no waiting is done if a lock cannot instantly be acquired.
66   // If negative, there is no timeout and will block indefinitely when acquiring
67   // a lock.
68   //
69   // Not using a timeout can lead to deadlocks.  Currently, there
70   // is no deadlock-detection to recover from a deadlock.  While DB writes
71   // cannot deadlock with other DB writes, they can deadlock with a transaction.
72   // A negative timeout should only be used if all transactions have a small
73   // expiration set.
74   int64_t default_lock_timeout = 1000;  // 1 second
75 
76   // If set, the TransactionDB will use this implementation of a mutex and
77   // condition variable for all transaction locking instead of the default
78   // mutex/condvar implementation.
79   std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory;
80 
81   // The policy for when to write the data into the DB. The default policy is to
82   // write only the committed data (WRITE_COMMITTED). The data could be written
83   // before the commit phase. The DB then needs to provide the mechanisms to
84   // tell apart committed from uncommitted data.
85   TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
86 
87   // TODO(myabandeh): remove this option
88   // Note: this is a temporary option as a hot fix in rollback of writeprepared
89   // txns in myrocks. MyRocks uses merge operands for autoinc column id without
90   // however obtaining locks. This breaks the assumption behind the rollback
91   // logic in myrocks. This hack of simply not rolling back merge operands works
92   // for the special way that myrocks uses this operands.
93   bool rollback_merge_operands = false;
94 
95   // If true, the TransactionDB implementation might skip concurrency control
96   // unless it is overridden by TransactionOptions or
97   // TransactionDBWriteOptimizations. This can be used in conjuction with
98   // DBOptions::unordered_write when the TransactionDB is used solely for write
99   // ordering rather than concurrency control.
100   bool skip_concurrency_control = false;
101 
102   // This option is only valid for write unprepared. If a write batch exceeds
103   // this threshold, then the transaction will implicitly flush the currently
104   // pending writes into the database. A value of 0 or less means no limit.
105   int64_t default_write_batch_flush_threshold = 0;
106 
107  private:
108   // 128 entries
109   size_t wp_snapshot_cache_bits = static_cast<size_t>(7);
110   // 8m entry, 64MB size
111   size_t wp_commit_cache_bits = static_cast<size_t>(23);
112 
113   // For testing, whether transaction name should be auto-generated or not. This
114   // is useful for write unprepared which requires named transactions.
115   bool autogenerate_name = false;
116 
117   friend class WritePreparedTxnDB;
118   friend class WriteUnpreparedTxn;
119   friend class WritePreparedTransactionTestBase;
120   friend class TransactionTestBase;
121   friend class MySQLStyleTransactionTest;
122 };
123 
// Per-transaction settings.
struct TransactionOptions {
  // set_snapshot=true has the same effect as calling
  // Transaction::SetSnapshot().
  bool set_snapshot = false;

  // When true, the transaction checks before acquiring locks whether doing so
  // would cause a deadlock. If it would, the operation returns Status::Busy
  // and the user should retry the transaction.
  bool deadlock_detect = false;

  // When set, states that the CommitTimeWriteBatch represents the latest
  // state of the application, has only one sub-batch (i.e. no duplicate keys),
  // and is meant to be used later during recovery. This enables an
  // optimization that postpones updating the memtable with
  // CommitTimeWriteBatch to only SwitchMemtable or recovery.
  bool use_only_the_last_commit_time_batch_for_recovery = false;

  // TODO(agiardullo): TransactionDB does not yet support comparators that
  // allow two non-equal keys to be equivalent, i.e. cmp->Compare(a,b) should
  // only return 0 if a.compare(b) returns 0.

  // Wait timeout, in milliseconds, applied when this transaction attempts to
  // lock a key.
  //
  //   > 0: wait up to this many milliseconds.
  //   = 0: no waiting is done if the lock cannot be acquired instantly.
  //   < 0: TransactionDBOptions::transaction_lock_timeout is used instead.
  int64_t lock_timeout = -1;

  // Expiration duration in milliseconds. If non-negative, a transaction that
  // lasts longer than this many milliseconds fails to commit. If not set, a
  // forgotten transaction that is never committed, rolled back, or deleted
  // never relinquishes the locks it holds, which could prevent other writers
  // from writing those keys.
  int64_t expiration = -1;

  // Number of traversals to make during deadlock detection.
  int64_t deadlock_detect_depth = 50;

  // Maximum number of bytes used for the write batch. 0 means no limit.
  size_t max_write_batch_size = 0;

  // Skip concurrency control. Usable as an optimization when the application
  // knows the transaction cannot conflict with concurrent transactions. It
  // can also be used during recovery if the application guarantees (i) no
  // conflict between prepared transactions in the WAL and (ii) that recovered
  // transactions are rolled back / committed before new transactions start.
  // Default: false
  bool skip_concurrency_control = false;

  // See TransactionDBOptions::default_write_batch_flush_threshold for the
  // description. A negative value means the default from
  // TransactionDBOptions is used.
  int64_t write_batch_flush_threshold = -1;
};
180 
// Per-write optimization hints for writes that do not involve transactions.
// A TransactionDB implementation might or might not make use of them.
struct TransactionDBWriteOptimizations {
  // True means the application guarantees that the key-set in the write
  // batch does not conflict with any concurrent transaction, so the
  // concurrency control mechanism could be skipped for this write.
  bool skip_concurrency_control = false;

  // True means the application guarantees that the write batch contains no
  // duplicate <column family, key>, so any mechanism employed to handle
  // duplicate keys could be skipped.
  bool skip_duplicate_key_check = false;
};
194 
195 struct KeyLockInfo {
196   std::string key;
197   std::vector<TransactionID> ids;
198   bool exclusive;
199 };
200 
201 struct DeadlockInfo {
202   TransactionID m_txn_id;
203   uint32_t m_cf_id;
204   bool m_exclusive;
205   std::string m_waiting_key;
206 };
207 
208 struct DeadlockPath {
209   std::vector<DeadlockInfo> path;
210   bool limit_exceeded;
211   int64_t deadlock_time;
212 
DeadlockPathDeadlockPath213   explicit DeadlockPath(std::vector<DeadlockInfo> path_entry,
214                         const int64_t& dl_time)
215       : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
216 
217   // empty path, limit exceeded constructor and default constructor
218   explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false)
219       : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
220 
emptyDeadlockPath221   bool empty() { return path.empty() && !limit_exceeded; }
222 };
223 
224 class TransactionDB : public StackableDB {
225  public:
226   // Optimized version of ::Write that receives more optimization request such
227   // as skip_concurrency_control.
228   using StackableDB::Write;
Write(const WriteOptions & opts,const TransactionDBWriteOptimizations &,WriteBatch * updates)229   virtual Status Write(const WriteOptions& opts,
230                        const TransactionDBWriteOptimizations&,
231                        WriteBatch* updates) {
232     // The default implementation ignores TransactionDBWriteOptimizations and
233     // falls back to the un-optimized version of ::Write
234     return Write(opts, updates);
235   }
236   // Open a TransactionDB similar to DB::Open().
237   // Internally call PrepareWrap() and WrapDB()
238   // If the return status is not ok, then dbptr is set to nullptr.
239   static Status Open(const Options& options,
240                      const TransactionDBOptions& txn_db_options,
241                      const std::string& dbname, TransactionDB** dbptr);
242 
243   static Status Open(const DBOptions& db_options,
244                      const TransactionDBOptions& txn_db_options,
245                      const std::string& dbname,
246                      const std::vector<ColumnFamilyDescriptor>& column_families,
247                      std::vector<ColumnFamilyHandle*>* handles,
248                      TransactionDB** dbptr);
249   // Note: PrepareWrap() may change parameters, make copies before the
250   // invocation if needed.
251   static void PrepareWrap(DBOptions* db_options,
252                           std::vector<ColumnFamilyDescriptor>* column_families,
253                           std::vector<size_t>* compaction_enabled_cf_indices);
254   // If the return status is not ok, then dbptr will bet set to nullptr. The
255   // input db parameter might or might not be deleted as a result of the
256   // failure. If it is properly deleted it will be set to nullptr. If the return
257   // status is ok, the ownership of db is transferred to dbptr.
258   static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options,
259                        const std::vector<size_t>& compaction_enabled_cf_indices,
260                        const std::vector<ColumnFamilyHandle*>& handles,
261                        TransactionDB** dbptr);
262   // If the return status is not ok, then dbptr will bet set to nullptr. The
263   // input db parameter might or might not be deleted as a result of the
264   // failure. If it is properly deleted it will be set to nullptr. If the return
265   // status is ok, the ownership of db is transferred to dbptr.
266   static Status WrapStackableDB(
267       StackableDB* db, const TransactionDBOptions& txn_db_options,
268       const std::vector<size_t>& compaction_enabled_cf_indices,
269       const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr);
270   // Since the destructor in StackableDB is virtual, this destructor is virtual
271   // too. The root db will be deleted by the base's destructor.
~TransactionDB()272   ~TransactionDB() override {}
273 
274   // Starts a new Transaction.
275   //
276   // Caller is responsible for deleting the returned transaction when no
277   // longer needed.
278   //
279   // If old_txn is not null, BeginTransaction will reuse this Transaction
280   // handle instead of allocating a new one.  This is an optimization to avoid
281   // extra allocations when repeatedly creating transactions.
282   virtual Transaction* BeginTransaction(
283       const WriteOptions& write_options,
284       const TransactionOptions& txn_options = TransactionOptions(),
285       Transaction* old_txn = nullptr) = 0;
286 
287   virtual Transaction* GetTransactionByName(const TransactionName& name) = 0;
288   virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0;
289 
290   // Returns set of all locks held.
291   //
292   // The mapping is column family id -> KeyLockInfo
293   virtual std::unordered_multimap<uint32_t, KeyLockInfo>
294   GetLockStatusData() = 0;
295   virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
296   virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0;
297 
298  protected:
299   // To Create an TransactionDB, call Open()
300   // The ownership of db is transferred to the base StackableDB
TransactionDB(DB * db)301   explicit TransactionDB(DB* db) : StackableDB(db) {}
302   // No copying allowed
303   TransactionDB(const TransactionDB&) = delete;
304   void operator=(const TransactionDB&) = delete;
305 };
306 
307 }  // namespace ROCKSDB_NAMESPACE
308 
309 #endif  // ROCKSDB_LITE
310