1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3 #ident "$Id$"
4 /*======
5 This file is part of PerconaFT.
6 
7 
8 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9 
10     PerconaFT is free software: you can redistribute it and/or modify
11     it under the terms of the GNU General Public License, version 2,
12     as published by the Free Software Foundation.
13 
14     PerconaFT is distributed in the hope that it will be useful,
15     but WITHOUT ANY WARRANTY; without even the implied warranty of
16     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17     GNU General Public License for more details.
18 
19     You should have received a copy of the GNU General Public License
20     along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
21 
22 ----------------------------------------
23 
24     PerconaFT is free software: you can redistribute it and/or modify
25     it under the terms of the GNU Affero General Public License, version 3,
26     as published by the Free Software Foundation.
27 
28     PerconaFT is distributed in the hope that it will be useful,
29     but WITHOUT ANY WARRANTY; without even the implied warranty of
30     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
31     GNU Affero General Public License for more details.
32 
33     You should have received a copy of the GNU Affero General Public License
34     along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
35 
36 ----------------------------------------
37 
38    Licensed under the Apache License, Version 2.0 (the "License");
39    you may not use this file except in compliance with the License.
40    You may obtain a copy of the License at
41 
42        http://www.apache.org/licenses/LICENSE-2.0
43 
44    Unless required by applicable law or agreed to in writing, software
45    distributed under the License is distributed on an "AS IS" BASIS,
46    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
47    See the License for the specific language governing permissions and
48    limitations under the License.
49 ======= */
50 
51 #ident \
52     "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
53 
54 #pragma once
55 
56 #include <atomic>
57 
58 #include "../db.h"
59 #include "../ft/comparator.h"
60 #include "../portability/toku_external_pthread.h"
61 #include "../portability/toku_pthread.h"
62 #include "../portability/toku_time.h"
63 // PORT #include <ft/ft-ops.h>  // just for DICTIONARY_ID..
64 // PORT: ft-status for LTM_STATUS:
65 #include "../ft/ft-status.h"
66 
// PORT: local stand-in for DICTIONARY_ID (normally from ft/ft-ops.h).
// Uniquely identifies an open dictionary; the manager keeps one locktree
// per dictionary id.
struct DICTIONARY_ID {
  uint64_t dictid;
};
70 
71 #include "../util/omt.h"
72 #include "range_buffer.h"
73 #include "txnid_set.h"
74 #include "wfg.h"
75 
76 namespace toku {
77 
78 class locktree;
79 class locktree_manager;
80 class lock_request;
81 class concurrent_tree;
82 
// Callbacks invoked by the locktree manager at locktree lifecycle events:
// - lt_create_cb: called just after a locktree is first created.
// - lt_destroy_cb: called just before a locktree is destroyed.
// - lt_escalate_cb: called after a locktree is escalated; receives the txnid
//   whose locks were escalated, the resulting ranges, and the extra parameter
//   given to locktree_manager::create().
typedef int (*lt_create_cb)(locktree *lt, void *extra);
typedef void (*lt_destroy_cb)(locktree *lt);
typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt,
                               const range_buffer &buffer, void *extra);
87 
// Aggregated lock-wait statistics for a locktree: how many requests waited,
// for how long, how many waits were "long", and how many timed out.
struct lt_counters {
  uint64_t wait_count, wait_time;
  uint64_t long_wait_count, long_wait_time;
  uint64_t timeout_count;

  // Fold another set of counters into this one, field by field.
  // The additions are independent, so order does not matter.
  void add(const lt_counters &other) {
    timeout_count += other.timeout_count;
    long_wait_time += other.long_wait_time;
    long_wait_count += other.long_wait_count;
    wait_time += other.wait_time;
    wait_count += other.wait_count;
  }
};
101 
// Lock request state for some locktree: the set of lock requests that could
// not be granted immediately and the bookkeeping used to retry them.
struct lt_lock_request_info {
  // pending (not yet granted) lock requests on this locktree
  omt<lock_request *> pending_lock_requests;
  // cached emptiness flag, so readers can check for pending requests
  // without taking 'mutex'
  std::atomic_bool pending_is_empty;
  // NOTE(review): appears to protect pending_lock_requests -- confirm exact
  // coverage in the implementation
  toku_external_mutex_t mutex;
  bool should_retry_lock_requests;
  // wait/timeout statistics for lock requests on this locktree
  lt_counters counters;
  // retry generation counters: retry_want is bumped to ask for a retry pass,
  // retry_done records completed passes -- presumably used to coalesce
  // concurrent retry requests; verify against lock_request.cc
  std::atomic_ullong retry_want;
  unsigned long long retry_done;
  // protect the retry state (retry_done, running_retry)
  toku_mutex_t retry_mutex;
  toku_cond_t retry_cv;
  // true while some thread is executing a retry pass
  bool running_retry;

  void init(toku_external_mutex_factory_t mutex_factory);
  void destroy(void);
};
118 
// The locktree manager manages a set of locktrees, one for each open
// dictionary. Locktrees are retrieved from the manager. When they are no
// longer needed, they are released by the user.
class locktree_manager {
 public:
  // effect: Initializes the manager and stores the given callbacks.
  // param: create_cb, called just after a locktree is first created.
  //        destroy_cb, called just before a locktree is destroyed.
  //        escalate_cb, called after a locktree is escalated (with extra
  //        param)
  void create(lt_create_cb create_cb, lt_destroy_cb destroy_cb,
              lt_escalate_cb escalate_cb, void *extra,
              toku_external_mutex_factory_t mutex_factory_arg);

  void destroy(void);

  // returns: the current limit on total lock memory
  size_t get_max_lock_memory(void);

  // effect: Sets the limit on total lock memory.
  int set_max_lock_memory(size_t max_lock_memory);

  // effect: Get a locktree from the manager. If a locktree exists with the
  //         given dict_id, it is referenced and then returned. If one did
  //         not exist, it is created. It will use the comparator for
  //         comparing keys. The on_create callback (passed to
  //         locktree_manager::create()) will be called with the given extra
  //         parameter.
  locktree *get_lt(DICTIONARY_ID dict_id, const comparator &cmp,
                   void *on_create_extra);

  // effect: Adds a reference to the given locktree.
  void reference_lt(locktree *lt);

  // effect: Releases one reference on a locktree. If the reference count
  //         transitions to zero, the on_destroy callback is called before
  //         it gets destroyed.
  void release_lt(locktree *lt);

  // effect: Reports the manager's engine status into the given structure.
  void get_status(LTM_STATUS status);

  // effect: calls the iterate function on each pending lock request
  // note: holds the manager's mutex
  typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id,
                                               TXNID txnid, const DBT *left_key,
                                               const DBT *right_key,
                                               TXNID blocking_txnid,
                                               uint64_t start_time,
                                               void *extra);
  int iterate_pending_lock_requests(lock_request_iterate_callback cb,
                                    void *extra);

  // effect: Determines if too many locks or too much memory is being used,
  //         Runs escalation on the manager if so.
  // param: big_txn, if the current transaction is 'big' (has spilled
  //        rollback logs)
  // returns: 0 if there are enough resources to create a new lock, or
  //          TOKUDB_OUT_OF_LOCKS if there are not enough resources and lock
  //          escalation failed to free up enough resources for a new lock.
  int check_current_lock_constraints(bool big_txn);

  bool over_big_threshold(void);

  // effect: Accounts for mem_used additional bytes of lock memory.
  void note_mem_used(uint64_t mem_used);

  // effect: Accounts for mem_freed bytes of lock memory being released.
  void note_mem_released(uint64_t mem_freed);

  bool out_of_locks(void) const;

  // Escalate all locktrees
  void escalate_all_locktrees(void);

  // Escalate a set of locktrees
  void escalate_locktrees(locktree **locktrees, int num_locktrees);

  // effect: calls the private function run_escalation(), only ok to
  //         do for tests.
  // rationale: to get better stress test coverage, we want a way to
  //            deterministically trigger lock escalation.
  void run_escalation_for_test(void);
  void run_escalation(void);

  // Add time t to the escalator's wait time statistics
  void add_escalator_wait_time(uint64_t t);

  void kill_waiter(void *extra);

 private:
  static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024;

  // tracks the current number of locks and lock memory
  uint64_t m_max_lock_memory;
  uint64_t m_current_lock_memory;

  // aggregated wait statistics across all locktrees
  struct lt_counters m_lt_counters;

  // the create and destroy callbacks for the locktrees
  lt_create_cb m_lt_create_callback;
  lt_destroy_cb m_lt_destroy_callback;
  lt_escalate_cb m_lt_escalate_callback;
  void *m_lt_escalate_callback_extra;

  // map of open locktrees, one per open dictionary
  omt<locktree *> m_locktree_map;

  toku_external_mutex_factory_t mutex_factory;

  // the manager's mutex protects the locktree map
  toku_mutex_t m_mutex;

  void mutex_lock(void);

  void mutex_unlock(void);

  // Manage the set of open locktrees
  locktree *locktree_map_find(const DICTIONARY_ID &dict_id);
  void locktree_map_put(locktree *lt);
  void locktree_map_remove(locktree *lt);

  // omt search callback: matches a locktree in the map by dictionary id
  static int find_by_dict_id(locktree *const &lt, const DICTIONARY_ID &dict_id);

  void escalator_init(void);
  void escalator_destroy(void);

  // statistics about lock escalation.
  toku_mutex_t m_escalation_mutex;
  uint64_t m_escalation_count;
  tokutime_t m_escalation_time;
  uint64_t m_escalation_latest_result;
  uint64_t m_wait_escalation_count;
  uint64_t m_wait_escalation_time;
  uint64_t m_long_wait_escalation_count;
  uint64_t m_long_wait_escalation_time;

  // the escalator coordinates escalation on a set of locktrees for a bunch
  // of threads
  class locktree_escalator {
   public:
    void create(void);
    void destroy(void);
    // effect: runs escalate_locktrees_fun(extra) on behalf of the calling
    //         thread (coordination details live in the implementation)
    void run(locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra),
             void *extra);

   private:
    toku_mutex_t m_escalator_mutex;
    toku_cond_t m_escalator_done;
    bool m_escalator_running;
  };

  locktree_escalator m_escalator;

  friend class manager_unit_test;
};
268 
// A locktree represents the set of row locks owned by all transactions
// over an open dictionary. Read and write ranges are represented as
// a left and right key which are compared with the given comparator
//
// Locktrees are not created and destroyed by the user. Instead, they are
// referenced and released using the locktree manager.
//
// A sample workflow looks like this:
// - Create a manager.
// - Get a locktree by dictionary id from the manager.
// - Perform read/write lock acquisition on the locktree, add references to
//   the locktree using the manager, release locks, release references, etc.
// - ...
// - Release the final reference to the locktree. It will be destroyed.
// - Destroy the manager.
class locktree {
 public:
  // effect: Creates a locktree
  void create(locktree_manager *mgr, DICTIONARY_ID dict_id,
              const comparator &cmp,
              toku_external_mutex_factory_t mutex_factory);

  void destroy(void);

  // For thread-safe, external reference counting
  void add_reference(void);

  // requires: the reference count is > 0
  // returns: the reference count, after decrementing it by one
  uint32_t release_reference(void);

  // returns: the current reference count
  uint32_t get_reference_count(void);

  // effect: Attempts to grant a read lock for the range of keys between
  //         [left_key, right_key].
  // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and
  //          populate the given conflicts set with the txnids that hold
  //          conflicting locks in the range. If the locktree cannot create
  //          more locks, return TOKUDB_OUT_OF_LOCKS.
  // note: Read locks cannot be shared between txnids, as one would expect.
  //       This is for simplicity since read locks are rare in MySQL.
  int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
                        txnid_set *conflicts, bool big_txn);

  // effect: Attempts to grant a write lock for the range of keys between
  //         [left_key, right_key].
  // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and
  //          populate the given conflicts set with the txnids that hold
  //          conflicting locks in the range. If the locktree cannot create
  //          more locks, return TOKUDB_OUT_OF_LOCKS.
  int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
                         txnid_set *conflicts, bool big_txn);

  // effect: populate the conflicts set with the txnids that would prevent
  //         the given txnid from getting a lock on [left_key, right_key]
  void get_conflicts(bool is_write_request, TXNID txnid, const DBT *left_key,
                     const DBT *right_key, txnid_set *conflicts);

  // effect: Release all of the lock ranges represented by the range buffer
  //         for a txnid.
  void release_locks(TXNID txnid, const range_buffer *ranges,
                     bool all_trx_locks_hint = false);

  // effect: Runs escalation on this locktree
  void escalate(lt_escalate_cb after_escalate_callback, void *extra);

  // returns: The userdata associated with this locktree, or null if it has
  //          not been set.
  void *get_userdata(void) const;

  void set_userdata(void *userdata);

  locktree_manager *get_manager(void) const;

  void set_comparator(const comparator &cmp);

  int compare(const locktree *lt) const;

  DICTIONARY_ID get_dict_id() const;

  // Private info struct for storing pending lock request state.
  // Only to be used by lock requests. We store it here as
  // something less opaque than usual to strike a tradeoff between
  // abstraction and code complexity. It is still fairly abstract
  // since the lock_request object is opaque
  struct lt_lock_request_info *get_lock_request_info(void);

  // callback invoked once per lock when dumping the locktree's contents
  typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right,
                                TXNID txnid, bool is_shared,
                                TxnidVector *owners);
  void dump_locks(void *cdata, dump_callback cb);

 private:
  locktree_manager *m_mgr;
  DICTIONARY_ID m_dict_id;
  uint32_t m_reference_count;

  // Since the memory referenced by this comparator is not owned by the
  // locktree, the user must guarantee it will outlive the locktree.
  //
  // The ydb API accomplishes this by opening an ft_handle in the on_create
  // callback, which will keep the underlying FT (and its descriptor) in
  // memory for as long as the handle is open. The ft_handle is stored
  // opaquely in the userdata pointer below. see locktree_manager::get_lt
  // w/ on_create_extra
  comparator m_cmp;

  concurrent_tree *m_rangetree;

  void *m_userdata;
  struct lt_lock_request_info m_lock_request_info;

  // psergey-todo:
  //  Each transaction also keeps a list of ranges it has locked.
  //  So, when a transaction is running in STO mode, two identical
  //  lists are kept: the STO lock list and transaction's owned locks
  //  list. Why can't we do with just one list?

  // The following fields and members prefixed with "sto_" are for
  // the single txnid optimization, intended to speed up the case
  // when only one transaction is using the locktree. If we know
  // the locktree has only one transaction, then acquiring locks
  // takes O(1) work and releasing all locks takes O(1) work.
  //
  // How do we know that the locktree only has a single txnid?
  // What do we do if it does?
  //
  // When a txn with txnid T requests a lock:
  // - If the tree is empty, the optimization is possible. Set the single
  // txnid to T, and insert the lock range into the buffer.
  // - If the tree is not empty, check if the single txnid is T. If so,
  // append the lock range to the buffer. Otherwise, migrate all of
  // the locks in the buffer into the rangetree on behalf of txnid T,
  // and invalidate the single txnid.
  //
  // When a txn with txnid T releases its locks:
  // - If the single txnid is valid, it must be for T. Destroy the buffer.
  // - If it's not valid, release locks the normal way in the rangetree.
  //
  // To carry out the optimization we need to record a single txnid
  // and a range buffer for each locktree, each protected by the root
  // lock of the locktree's rangetree. The root lock for a rangetree
  // is grabbed by preparing a locked keyrange on the rangetree.
  TXNID m_sto_txnid;
  range_buffer m_sto_buffer;

  // The single txnid optimization speeds up the case when only one
  // transaction is using the locktree. But it has the potential to
  // hurt the case when more than one txnid exists.
  //
  // There are two things we need to do to make the optimization only
  // optimize the case we care about, and not hurt the general case.
  //
  // Bound the worst-case latency for lock migration when the
  // optimization stops working:
  // - Idea: Stop the optimization and migrate immediately if we notice
  // the single txnid has taken many locks in the range buffer.
  // - Implementation: Enforce a max size on the single txnid range buffer.
  // - Analysis: Choosing the perfect max value, M, is difficult to do
  // without some feedback from the field. Intuition tells us that M should
  // not be so small that the optimization is worthless, and it should not
  // be so big that it's unreasonable to have to wait behind a thread doing
  // the work of converting M buffer locks into rangetree locks.
  //
  // Prevent concurrent-transaction workloads from trying the optimization
  // in vain:
  // - Idea: Don't even bother trying the optimization if we think the
  // system is in a concurrent-transaction state.
  // - Implementation: Do something even simpler than detecting whether the
  // system is in a concurrent-transaction state. Just keep a "score" value
  // and some threshold. If at any time the locktree is eligible for the
  // optimization, only do it if the score is at this threshold. When you
  // actually do the optimization but someone has to migrate locks in the
  // buffer (expensive), then reset the score back to zero. Each time a txn
  // releases locks, the score is incremented by 1.
  // - Analysis: If you let the threshold be "C", then at most 1 / C txns
  // will do the optimization in a concurrent-transaction system. Similarly,
  // it takes at most C txns to start using the single txnid optimization,
  // which is good when the system transitions from multithreaded to single
  // threaded.
  //
  // STO_BUFFER_MAX_SIZE:
  //
  // We choose the max value to be 1 million since most transactions are
  // smaller than 1 million and we can create a rangetree of 1 million
  // elements in less than a second. So we can be pretty confident that this
  // threshold enables the optimization almost always, and prevents super
  // pathological latency issues for the first lock taken by a second thread.
  // NOTE(review): the comment above says 1 million, but the constant below
  // is 50 * 1024 -- confirm which value is intended and update the stale
  // side.
  //
  // STO_SCORE_THRESHOLD:
  //
  // A simple first guess at a good value for the score threshold is 100.
  // By our analysis, we'd end up doing the optimization in vain for
  // around 1% of all transactions, which seems reasonable. Further,
  // if the system goes single threaded, it ought to be pretty quick
  // for 100 transactions to go by, so we won't have to wait long before
  // we start doing the single txnid optimization again.
  static const int STO_BUFFER_MAX_SIZE = 50 * 1024;
  static const int STO_SCORE_THRESHOLD = 100;
  int m_sto_score;

  // statistics about time spent ending the STO early
  uint64_t m_sto_end_early_count;
  tokutime_t m_sto_end_early_time;

  // effect: begins the single txnid optimization, setting m_sto_txnid
  //         to the given txnid.
  // requires: m_sto_txnid is invalid
  void sto_begin(TXNID txnid);

  // effect: append a range to the sto buffer
  // requires: m_sto_txnid is valid
  void sto_append(const DBT *left_key, const DBT *right_key,
                  bool is_write_request);

  // effect: ends the single txnid optimization, releasing any memory
  //         stored in the sto buffer, notifying the tracker, and
  //         invalidating m_sto_txnid.
  // requires: m_sto_txnid is valid
  void sto_end(void);

  // params: prepared_lkr is a void * to a prepared locked keyrange. see
  //         below.
  // effect: ends the single txnid optimization early, migrating buffer locks
  //         into the rangetree, calling sto_end(), and then setting the
  //         sto_score back to zero.
  // requires: m_sto_txnid is valid
  void sto_end_early(void *prepared_lkr);
  void sto_end_early_no_accounting(void *prepared_lkr);

  // params: prepared_lkr is a void * to a prepared locked keyrange. we
  //         can't use the real type because the compiler won't allow us to
  //         forward declare concurrent_tree::locked_keyrange without
  //         including concurrent_tree.h, which we cannot do here because it
  //         is a template implementation.
  // requires: the prepared locked keyrange is for the locktree's rangetree
  // requires: m_sto_txnid is valid
  // effect: migrates each lock in the single txnid buffer into the
  //         locktree's rangetree, notifying the memory tracker as necessary.
  void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr);

  // effect: If m_sto_txnid is valid, then release the txnid's locks
  //         by ending the optimization.
  // requires: If m_sto_txnid is valid, it is equal to the given txnid
  // returns: True if locks were released for this txnid
  bool sto_try_release(TXNID txnid);

  // params: prepared_lkr is a void * to a prepared locked keyrange. see
  //         above.
  // requires: the prepared locked keyrange is for the locktree's rangetree
  // effect: If m_sto_txnid is valid and equal to the given txnid, then
  //         append a range onto the buffer. Otherwise, if m_sto_txnid is
  //         valid but not equal to this txnid, then migrate the buffer's
  //         locks into the rangetree and end the optimization, setting the
  //         score back to zero.
  // returns: true if the lock was acquired for this txnid
  bool sto_try_acquire(void *prepared_lkr, TXNID txnid, const DBT *left_key,
                       const DBT *right_key, bool is_write_request);

  // Effect:
  //  Provides a hook for a helgrind suppression.
  // Returns:
  //  true if m_sto_txnid is not TXNID_NONE
  bool sto_txnid_is_valid_unsafe(void) const;

  // Effect:
  //  Provides a hook for a helgrind suppression.
  // Returns:
  //  m_sto_score
  int sto_get_score_unsafe(void) const;

  void remove_overlapping_locks_for_txnid(TXNID txnid, const DBT *left_key,
                                          const DBT *right_key);

  int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
                                const DBT *left_key, const DBT *right_key,
                                bool is_write_request, txnid_set *conflicts);

  int acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key,
                   const DBT *right_key, txnid_set *conflicts);

  int try_acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key,
                       const DBT *right_key, txnid_set *conflicts,
                       bool big_txn);

  friend class locktree_unit_test;
  friend class manager_unit_test;
  friend class lock_request_unit_test;

  // engine status reaches into the locktree to read some stats
  friend void locktree_manager::get_status(LTM_STATUS status);
};
559 
560 } /* namespace toku */
561