1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 
6 #pragma once
7 #ifndef ROCKSDB_LITE
8 
9 #include <string>
10 #include <utility>
11 #include <vector>
12 
13 #include "rocksdb/comparator.h"
14 #include "rocksdb/db.h"
15 #include "rocksdb/utilities/stackable_db.h"
16 #include "rocksdb/utilities/transaction.h"
17 
18 // Database with Transaction support.
19 //
20 // See transaction.h and examples/transaction_example.cc
21 
22 namespace ROCKSDB_NAMESPACE {
23 
24 class TransactionDBMutexFactory;
25 
// Selects at which point of a transaction's life its data is written into the
// DB.
enum TxnDBWritePolicy {
  // Write only the committed data.
  WRITE_COMMITTED = 0,
  // Write data after the prepare phase of 2pc.
  WRITE_PREPARED = 1,
  // Write data before the prepare phase of 2pc.
  WRITE_UNPREPARED = 2
};
31 
// Initial value of TransactionDBOptions::max_num_deadlocks.
constexpr uint32_t kInitialMaxDeadlocks = 5;
33 
// Forward declarations. LockManager is only handed around by pointer in this
// header; RangeLockInfo is defined further down in this file.
class LockManager;
struct RangeLockInfo;

// Handle through which a lock manager is given to RocksDB.
//
// Intended usage:
//  * Call a factory function (for example NewRangeLockManager()) to create a
//    lock manager and receive its handle.
//  * The handle of a particular kind of lock manager offers additional
//    methods and parameters for controlling that lock manager.
//  * Put the handle into TransactionDBOptions::lock_mgr_handle; it is then
//    used to perform locking.
class LockManagerHandle {
 public:
  // Called by PessimisticTransactionDB to obtain the lock manager it is going
  // to use.
  virtual LockManager* getLockManager() = 0;

  // Virtual so that a concrete handle can be destroyed through this interface.
  virtual ~LockManagerHandle() = default;
};
53 
// Counterpart of class Endpoint that keeps its bytes in a std::string, so the
// buffer allocation is managed by this object itself.
struct EndpointWithString {
  // Bytes of the endpoint.
  std::string slice;
  // Mirrors the flag of the same name in class Endpoint.
  // NOTE(review): presumably means "slice followed by an infinite suffix";
  // confirm against the documentation of Endpoint.
  bool inf_suffix;
};
59 
60 struct RangeDeadlockInfo {
61   TransactionID m_txn_id;
62   uint32_t m_cf_id;
63   bool m_exclusive;
64 
65   EndpointWithString m_start;
66   EndpointWithString m_end;
67 };
68 
69 struct RangeDeadlockPath {
70   std::vector<RangeDeadlockInfo> path;
71   bool limit_exceeded;
72   int64_t deadlock_time;
73 
RangeDeadlockPathRangeDeadlockPath74   explicit RangeDeadlockPath(std::vector<RangeDeadlockInfo> path_entry,
75                              const int64_t& dl_time)
76       : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
77 
78   // empty path, limit exceeded constructor and default constructor
79   explicit RangeDeadlockPath(const int64_t& dl_time = 0, bool limit = false)
80       : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
81 
emptyRangeDeadlockPath82   bool empty() { return path.empty() && !limit_exceeded; }
83 };
84 
85 // A handle to control RangeLockManager (Range-based lock manager) from outside
86 // RocksDB
87 class RangeLockManagerHandle : public LockManagerHandle {
88  public:
89   // Set total amount of lock memory to use.
90   //
91   //  @return 0 Ok
92   //  @return EDOM Failed to set because currently using more memory than
93   //        specified
94   virtual int SetMaxLockMemory(size_t max_lock_memory) = 0;
95   virtual size_t GetMaxLockMemory() = 0;
96 
97   using RangeLockStatus =
98       std::unordered_multimap<ColumnFamilyId, RangeLockInfo>;
99 
100   virtual RangeLockStatus GetRangeLockStatusData() = 0;
101 
102   class Counters {
103    public:
104     // Number of times lock escalation was triggered (for all column families)
105     uint64_t escalation_count;
106 
107     // How much memory is currently used for locks (total for all column
108     // families)
109     uint64_t current_lock_memory;
110   };
111 
112   // Get the current counter values
113   virtual Counters GetStatus() = 0;
114 
115   // Functions for range-based Deadlock reporting.
116   virtual std::vector<RangeDeadlockPath> GetRangeDeadlockInfoBuffer() = 0;
117   virtual void SetRangeDeadlockInfoBufferSize(uint32_t target_size) = 0;
118 
~RangeLockManagerHandle()119   virtual ~RangeLockManagerHandle() {}
120 };
121 
122 // A factory function to create a Range Lock Manager. The created object should
123 // be:
124 //  1. Passed in TransactionDBOptions::lock_mgr_handle to open the database in
125 //     range-locking mode
126 //  2. Used to control the lock manager when the DB is already open.
127 RangeLockManagerHandle* NewRangeLockManager(
128     std::shared_ptr<TransactionDBMutexFactory> mutex_factory);
129 
130 struct TransactionDBOptions {
131   // Specifies the maximum number of keys that can be locked at the same time
132   // per column family.
133   // If the number of locked keys is greater than max_num_locks, transaction
134   // writes (or GetForUpdate) will return an error.
135   // If this value is not positive, no limit will be enforced.
136   int64_t max_num_locks = -1;
137 
138   // Stores the number of latest deadlocks to track
139   uint32_t max_num_deadlocks = kInitialMaxDeadlocks;
140 
141   // Increasing this value will increase the concurrency by dividing the lock
142   // table (per column family) into more sub-tables, each with their own
143   // separate
144   // mutex.
145   size_t num_stripes = 16;
146 
147   // If positive, specifies the default wait timeout in milliseconds when
148   // a transaction attempts to lock a key if not specified by
149   // TransactionOptions::lock_timeout.
150   //
151   // If 0, no waiting is done if a lock cannot instantly be acquired.
152   // If negative, there is no timeout.  Not using a timeout is not recommended
153   // as it can lead to deadlocks.  Currently, there is no deadlock-detection to
154   // recover
155   // from a deadlock.
156   int64_t transaction_lock_timeout = 1000;  // 1 second
157 
158   // If positive, specifies the wait timeout in milliseconds when writing a key
159   // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write()
160   // directly).
161   // If 0, no waiting is done if a lock cannot instantly be acquired.
162   // If negative, there is no timeout and will block indefinitely when acquiring
163   // a lock.
164   //
165   // Not using a timeout can lead to deadlocks.  Currently, there
166   // is no deadlock-detection to recover from a deadlock.  While DB writes
167   // cannot deadlock with other DB writes, they can deadlock with a transaction.
168   // A negative timeout should only be used if all transactions have a small
169   // expiration set.
170   int64_t default_lock_timeout = 1000;  // 1 second
171 
172   // If set, the TransactionDB will use this implementation of a mutex and
173   // condition variable for all transaction locking instead of the default
174   // mutex/condvar implementation.
175   std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory;
176 
177   // The policy for when to write the data into the DB. The default policy is to
178   // write only the committed data (WRITE_COMMITTED). The data could be written
179   // before the commit phase. The DB then needs to provide the mechanisms to
180   // tell apart committed from uncommitted data.
181   TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
182 
183   // TODO(myabandeh): remove this option
184   // Note: this is a temporary option as a hot fix in rollback of writeprepared
185   // txns in myrocks. MyRocks uses merge operands for autoinc column id without
186   // however obtaining locks. This breaks the assumption behind the rollback
187   // logic in myrocks. This hack of simply not rolling back merge operands works
188   // for the special way that myrocks uses this operands.
189   bool rollback_merge_operands = false;
190 
191   // nullptr means use default lock manager.
192   // Other value means the user provides a custom lock manager.
193   std::shared_ptr<LockManagerHandle> lock_mgr_handle;
194 
195   // If true, the TransactionDB implementation might skip concurrency control
196   // unless it is overridden by TransactionOptions or
197   // TransactionDBWriteOptimizations. This can be used in conjunction with
198   // DBOptions::unordered_write when the TransactionDB is used solely for write
199   // ordering rather than concurrency control.
200   bool skip_concurrency_control = false;
201 
202   // This option is only valid for write unprepared. If a write batch exceeds
203   // this threshold, then the transaction will implicitly flush the currently
204   // pending writes into the database. A value of 0 or less means no limit.
205   int64_t default_write_batch_flush_threshold = 0;
206 
207  private:
208   // 128 entries
209   size_t wp_snapshot_cache_bits = static_cast<size_t>(7);
210   // 8m entry, 64MB size
211   size_t wp_commit_cache_bits = static_cast<size_t>(23);
212 
213   // For testing, whether transaction name should be auto-generated or not. This
214   // is useful for write unprepared which requires named transactions.
215   bool autogenerate_name = false;
216 
217   friend class WritePreparedTxnDB;
218   friend class WriteUnpreparedTxn;
219   friend class WritePreparedTransactionTestBase;
220   friend class TransactionTestBase;
221   friend class MySQLStyleTransactionTest;
222 };
223 
// Options that apply to a single transaction.
struct TransactionOptions {
  // set_snapshot=true has the same effect as calling
  // Transaction::SetSnapshot().
  bool set_snapshot{false};

  // When true, the transaction checks before acquiring locks whether doing so
  // would cause a deadlock. If it would, it returns with Status::Busy and the
  // user should retry their transaction.
  bool deadlock_detect{false};

  // When set, states that the CommitTimeWriteBatch represents the latest
  // state of the application, has only one sub-batch (i.e., no duplicate
  // keys), and is meant to be used later during recovery. This enables an
  // optimization that postpones updating the memtable with
  // CommitTimeWriteBatch to only SwitchMemtable or recovery.
  bool use_only_the_last_commit_time_batch_for_recovery{false};

  // TODO(agiardullo): TransactionDB does not yet support comparators that allow
  // two non-equal keys to be equivalent.  Ie, cmp->Compare(a,b) should only
  // return 0 if a.compare(b) returns 0.

  // Wait timeout, in milliseconds, used when the transaction attempts to lock
  // a key.
  //
  //  * positive: the timeout in milliseconds.
  //  * 0: no waiting is done if a lock cannot instantly be acquired.
  //  * negative: TransactionDBOptions::transaction_lock_timeout is used.
  int64_t lock_timeout{-1};

  // Expiration duration in milliseconds. If non-negative, a transaction that
  // lasts longer than this many milliseconds fails to commit. If not set, a
  // forgotten transaction that is never committed, rolled back, or deleted
  // never relinquishes the locks it holds, which could prevent keys from
  // being written by other writers.
  int64_t expiration{-1};

  // Number of traversals to make during deadlock detection.
  int64_t deadlock_detect_depth{50};

  // Maximum number of bytes used for the write batch. 0 means no limit.
  size_t max_write_batch_size{0};

  // Skip Concurrency Control. Can serve as an optimization when the
  // application knows the transaction cannot conflict with concurrent
  // transactions. Can also be used during recovery if (i) the application
  // guarantees no conflict between prepared transactions in the WAL and (ii)
  // the application guarantees that recovered transactions will be rolled
  // back/commit before new transactions start.
  // Default: false
  bool skip_concurrency_control{false};

  // For pessimistic transactions: when true, Prepare may be skipped before
  // Commit; otherwise Prepare must be called before Commit.
  bool skip_prepare{true};

  // Described at TransactionDBOptions::default_write_batch_flush_threshold.
  // A negative value means the default from TransactionDBOptions is used.
  int64_t write_batch_flush_threshold{-1};
};
284 
// Optimizations requested for an individual write that does not involve a
// transaction. A TransactionDB implementation might or might not make use of
// the optimizations specified here.
struct TransactionDBWriteOptimizations {
  // True means the application guarantees that the key-set in the write batch
  // does not conflict with any concurrent transaction, so the concurrency
  // control mechanism could be skipped for this write.
  bool skip_concurrency_control{false};
  // True means the application guarantees that the write batch contains no
  // duplicate <column family, key>, so any mechanism employed to handle
  // duplicate keys could be skipped.
  bool skip_duplicate_key_check{false};
};
298 
299 struct KeyLockInfo {
300   std::string key;
301   std::vector<TransactionID> ids;
302   bool exclusive;
303 };
304 
305 struct RangeLockInfo {
306   EndpointWithString start;
307   EndpointWithString end;
308   std::vector<TransactionID> ids;
309   bool exclusive;
310 };
311 
312 struct DeadlockInfo {
313   TransactionID m_txn_id;
314   uint32_t m_cf_id;
315   bool m_exclusive;
316   std::string m_waiting_key;
317 };
318 
319 struct DeadlockPath {
320   std::vector<DeadlockInfo> path;
321   bool limit_exceeded;
322   int64_t deadlock_time;
323 
DeadlockPathDeadlockPath324   explicit DeadlockPath(std::vector<DeadlockInfo> path_entry,
325                         const int64_t& dl_time)
326       : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
327 
328   // empty path, limit exceeded constructor and default constructor
329   explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false)
330       : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
331 
emptyDeadlockPath332   bool empty() { return path.empty() && !limit_exceeded; }
333 };
334 
335 class TransactionDB : public StackableDB {
336  public:
337   // Optimized version of ::Write that receives more optimization request such
338   // as skip_concurrency_control.
339   using StackableDB::Write;
Write(const WriteOptions & opts,const TransactionDBWriteOptimizations &,WriteBatch * updates)340   virtual Status Write(const WriteOptions& opts,
341                        const TransactionDBWriteOptimizations&,
342                        WriteBatch* updates) {
343     // The default implementation ignores TransactionDBWriteOptimizations and
344     // falls back to the un-optimized version of ::Write
345     return Write(opts, updates);
346   }
347   // Transactional `DeleteRange()` is not yet supported.
348   // However, users who know their deleted range does not conflict with
349   // anything can still use it via the `Write()` API. In all cases, the
350   // `Write()` overload specifying `TransactionDBWriteOptimizations` must be
351   // used and `skip_concurrency_control` must be set. When using either
352   // WRITE_PREPARED or WRITE_UNPREPARED , `skip_duplicate_key_check` must
353   // additionally be set.
DeleteRange(const WriteOptions &,ColumnFamilyHandle *,const Slice &,const Slice &)354   virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*,
355                              const Slice&, const Slice&) override {
356     return Status::NotSupported();
357   }
358   // Open a TransactionDB similar to DB::Open().
359   // Internally call PrepareWrap() and WrapDB()
360   // If the return status is not ok, then dbptr is set to nullptr.
361   static Status Open(const Options& options,
362                      const TransactionDBOptions& txn_db_options,
363                      const std::string& dbname, TransactionDB** dbptr);
364 
365   static Status Open(const DBOptions& db_options,
366                      const TransactionDBOptions& txn_db_options,
367                      const std::string& dbname,
368                      const std::vector<ColumnFamilyDescriptor>& column_families,
369                      std::vector<ColumnFamilyHandle*>* handles,
370                      TransactionDB** dbptr);
371   // Note: PrepareWrap() may change parameters, make copies before the
372   // invocation if needed.
373   static void PrepareWrap(DBOptions* db_options,
374                           std::vector<ColumnFamilyDescriptor>* column_families,
375                           std::vector<size_t>* compaction_enabled_cf_indices);
376   // If the return status is not ok, then dbptr will bet set to nullptr. The
377   // input db parameter might or might not be deleted as a result of the
378   // failure. If it is properly deleted it will be set to nullptr. If the return
379   // status is ok, the ownership of db is transferred to dbptr.
380   static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options,
381                        const std::vector<size_t>& compaction_enabled_cf_indices,
382                        const std::vector<ColumnFamilyHandle*>& handles,
383                        TransactionDB** dbptr);
384   // If the return status is not ok, then dbptr will bet set to nullptr. The
385   // input db parameter might or might not be deleted as a result of the
386   // failure. If it is properly deleted it will be set to nullptr. If the return
387   // status is ok, the ownership of db is transferred to dbptr.
388   static Status WrapStackableDB(
389       StackableDB* db, const TransactionDBOptions& txn_db_options,
390       const std::vector<size_t>& compaction_enabled_cf_indices,
391       const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr);
392   // Since the destructor in StackableDB is virtual, this destructor is virtual
393   // too. The root db will be deleted by the base's destructor.
~TransactionDB()394   ~TransactionDB() override {}
395 
396   // Starts a new Transaction.
397   //
398   // Caller is responsible for deleting the returned transaction when no
399   // longer needed.
400   //
401   // If old_txn is not null, BeginTransaction will reuse this Transaction
402   // handle instead of allocating a new one.  This is an optimization to avoid
403   // extra allocations when repeatedly creating transactions.
404   virtual Transaction* BeginTransaction(
405       const WriteOptions& write_options,
406       const TransactionOptions& txn_options = TransactionOptions(),
407       Transaction* old_txn = nullptr) = 0;
408 
409   virtual Transaction* GetTransactionByName(const TransactionName& name) = 0;
410   virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0;
411 
412   // Returns set of all locks held.
413   //
414   // The mapping is column family id -> KeyLockInfo
415   virtual std::unordered_multimap<uint32_t, KeyLockInfo>
416   GetLockStatusData() = 0;
417 
418   virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
419   virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0;
420 
421  protected:
422   // To Create an TransactionDB, call Open()
423   // The ownership of db is transferred to the base StackableDB
TransactionDB(DB * db)424   explicit TransactionDB(DB* db) : StackableDB(db) {}
425   // No copying allowed
426   TransactionDB(const TransactionDB&) = delete;
427   void operator=(const TransactionDB&) = delete;
428 };
429 
430 }  // namespace ROCKSDB_NAMESPACE
431 
432 #endif  // ROCKSDB_LITE
433