/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.


Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License, version 2,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.

----------------------------------------

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License, version 3,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.

----------------------------------------

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
======= */

#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."

#pragma once

#include <atomic>

#include <db.h>
#include <toku_pthread.h>
#include <toku_time.h>

#include <ft/comparator.h>
#include <ft/ft-ops.h>  // just for DICTIONARY_ID

#include <util/omt.h>

#include "txnid_set.h"
#include "wfg.h"
#include "range_buffer.h"


namespace toku {

    class locktree;
    class locktree_manager;
    class lock_request;
    class concurrent_tree;

    typedef int  (*lt_create_cb)(locktree *lt, void *extra);
    typedef void (*lt_destroy_cb)(locktree *lt);
    typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt, const range_buffer &buffer, void *extra);

    struct lt_counters {
        uint64_t wait_count, wait_time;
        uint64_t long_wait_count, long_wait_time;
        uint64_t timeout_count;

        void add(const lt_counters &rhs) {
            wait_count += rhs.wait_count;
            wait_time += rhs.wait_time;
            long_wait_count += rhs.long_wait_count;
            long_wait_time += rhs.long_wait_time;
            timeout_count += rhs.timeout_count;
        }
    };
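
    // Illustrative sketch (not part of the original header): lt_counters is a
    // plain accumulator, so per-locktree counters can be rolled up into a
    // global total with add(). The container `per_lt_counters` is hypothetical.
    //
    //     lt_counters total = {};
    //     for (const lt_counters &c : per_lt_counters)
    //         total.add(c);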

    // Lock request state for some locktree
    struct lt_lock_request_info {
        omt<lock_request *> pending_lock_requests;
        std::atomic_bool pending_is_empty;
        toku_mutex_t mutex;
        bool should_retry_lock_requests;
        lt_counters counters;
        std::atomic_ullong retry_want;
        unsigned long long retry_done;
        toku_mutex_t retry_mutex;
        toku_cond_t retry_cv;
        bool running_retry;

        void init(void);
        void destroy(void);
    };

    // The locktree manager manages a set of locktrees, one for each open
    // dictionary. Locktrees are retrieved from the manager. When they are no
    // longer needed, they are released by the user.
    class locktree_manager {
       public:
        // param: create_cb, called just after a locktree is first created.
        //        destroy_cb, called just before a locktree is destroyed.
        //        escalate_cb, called after a locktree is escalated (receives
        //        the extra param).
        void create(lt_create_cb create_cb,
                    lt_destroy_cb destroy_cb,
                    lt_escalate_cb escalate_cb,
                    void *extra);

        void destroy(void);

        size_t get_max_lock_memory(void);

        int set_max_lock_memory(size_t max_lock_memory);

        // effect: Get a locktree from the manager. If a locktree exists with the given
        //         dict_id, it is referenced and then returned. If one does not exist, it
        //         is created. It will use the comparator for comparing keys. The on_create
        //         callback (passed to locktree_manager::create()) will be called with the
        //         given extra parameter.
        locktree *get_lt(DICTIONARY_ID dict_id, const comparator &cmp, void *on_create_extra);

        void reference_lt(locktree *lt);

        // effect: Releases one reference on a locktree. If the reference count transitions
        //         to zero, the on_destroy callback is called before it gets destroyed.
        void release_lt(locktree *lt);

        void get_status(LTM_STATUS status);

        // effect: calls the iterate function on each pending lock request
        // note: holds the manager's mutex
        typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id,
                                                     TXNID txnid,
                                                     const DBT *left_key,
                                                     const DBT *right_key,
                                                     TXNID blocking_txnid,
                                                     uint64_t start_time,
                                                     void *extra);
        int iterate_pending_lock_requests(lock_request_iterate_callback cb, void *extra);
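
        // Illustrative sketch (an assumption, not taken from this header): a
        // minimal callback of the type above. The names `count_pending_cb`
        // and `num_pending` are hypothetical.
        //
        //     static int count_pending_cb(DICTIONARY_ID dict_id, TXNID txnid,
        //                                 const DBT *left_key, const DBT *right_key,
        //                                 TXNID blocking_txnid, uint64_t start_time,
        //                                 void *extra) {
        //         uint64_t *count = static_cast<uint64_t *>(extra);
        //         (*count)++;
        //         return 0;  // a nonzero return presumably stops the iteration
        //     }
        //
        //     uint64_t num_pending = 0;
        //     mgr.iterate_pending_lock_requests(count_pending_cb, &num_pending);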

        // effect: Determines if too many locks or too much memory is being used,
        //         and runs escalation on the manager if so.
        // param: big_txn, if the current transaction is 'big' (has spilled rollback logs)
        // returns: 0 if there are enough resources to create a new lock, or TOKUDB_OUT_OF_LOCKS
        //          if there are not enough resources and lock escalation failed to free up
        //          enough resources for a new lock.
        int check_current_lock_constraints(bool big_txn);
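
        // Illustrative use (a sketch, not taken from this repository's callers):
        // a lock-acquisition path might gate new locks on this check and surface
        // the error code to the client.
        //
        //     int r = mgr->check_current_lock_constraints(big_txn);
        //     if (r != 0)
        //         return r;  // expected to be TOKUDB_OUT_OF_LOCKS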

        bool over_big_threshold(void);

        void note_mem_used(uint64_t mem_used);

        void note_mem_released(uint64_t mem_freed);

        bool out_of_locks(void) const;

        // Escalate all locktrees
        void escalate_all_locktrees(void);

        // Escalate a set of locktrees
        void escalate_locktrees(locktree **locktrees, int num_locktrees);

        // effect: calls the private function run_escalation(); only OK to
        //         call from tests.
        // rationale: to get better stress test coverage, we want a way to
        //            deterministically trigger lock escalation.
        void run_escalation_for_test(void);
        void run_escalation(void);

        // Add time t to the escalator's wait time statistics
        void add_escalator_wait_time(uint64_t t);

        void kill_waiter(void *extra);

    private:
        static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024;

        // tracks the maximum and currently used lock memory
        uint64_t m_max_lock_memory;
        uint64_t m_current_lock_memory;

        struct lt_counters m_lt_counters;

        // the create, destroy, and escalate callbacks for the locktrees
        lt_create_cb m_lt_create_callback;
        lt_destroy_cb m_lt_destroy_callback;
        lt_escalate_cb m_lt_escalate_callback;
        void *m_lt_escalate_callback_extra;

        omt<locktree *> m_locktree_map;

        // the manager's mutex protects the locktree map
        toku_mutex_t m_mutex;

        void mutex_lock(void);

        void mutex_unlock(void);

        // Manage the set of open locktrees
        locktree *locktree_map_find(const DICTIONARY_ID &dict_id);
        void locktree_map_put(locktree *lt);
        void locktree_map_remove(locktree *lt);

        static int find_by_dict_id(locktree *const &lt, const DICTIONARY_ID &dict_id);

        void escalator_init(void);
        void escalator_destroy(void);

        // statistics about lock escalation.
        toku_mutex_t m_escalation_mutex;
        uint64_t m_escalation_count;
        tokutime_t m_escalation_time;
        uint64_t m_escalation_latest_result;
        uint64_t m_wait_escalation_count;
        uint64_t m_wait_escalation_time;
        uint64_t m_long_wait_escalation_count;
        uint64_t m_long_wait_escalation_time;

        // the escalator coordinates escalation on a set of locktrees for multiple threads
        class locktree_escalator {
        public:
            void create(void);
            void destroy(void);
            void run(locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra), void *extra);

        private:
            toku_mutex_t m_escalator_mutex;
            toku_cond_t m_escalator_done;
            bool m_escalator_running;
        };

        locktree_escalator m_escalator;

        friend class manager_unit_test;
    };

    // A locktree represents the set of row locks owned by all transactions
    // over an open dictionary. Read and write ranges are represented as
    // a left and right key, which are compared with the given comparator.
    //
    // Locktrees are not created and destroyed by the user. Instead, they are
    // referenced and released using the locktree manager.
    //
    // A sample workflow looks like this (see the sketch after this list):
    // - Create a manager.
    // - Get a locktree by dictionary id from the manager.
    // - Perform read/write lock acquisition on the locktree, add references to
    //   the locktree using the manager, release locks, release references, etc.
    // - ...
    // - Release the final reference to the locktree. It will be destroyed.
    // - Destroy the manager.
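    //
    // A rough sketch of that workflow in code (illustrative only; the
    // DICTIONARY_ID `dict_id`, the comparator `cmp`, and all error handling
    // are assumed to exist elsewhere, and the callbacks are left unset):
    //
    //     locktree_manager mgr;
    //     mgr.create(nullptr, nullptr, nullptr, nullptr);
    //     locktree *lt = mgr.get_lt(dict_id, cmp, nullptr);  // referenced
    //     // ... acquire and release row locks on lt ...
    //     mgr.release_lt(lt);  // the final release destroys the locktree
    //     mgr.destroy();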
    class locktree {
    public:
        // effect: Creates a locktree
        void create(locktree_manager *mgr, DICTIONARY_ID dict_id, const comparator &cmp);

        void destroy(void);

        // For thread-safe, external reference counting
        void add_reference(void);

        // requires: the reference count is > 0
        // returns: the reference count, after decrementing it by one
        uint32_t release_reference(void);

        // returns: the current reference count
        uint32_t get_reference_count(void);

        // effect: Attempts to grant a read lock for the range of keys between [left_key, right_key].
        // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and populate the
        //          given conflicts set with the txnids that hold conflicting locks in the range.
        //          If the locktree cannot create more locks, return TOKUDB_OUT_OF_LOCKS.
        // note: Read locks cannot be shared between txnids, as one might expect them to be.
        //       This is for simplicity, since read locks are rare in MySQL.
        int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn);

        // effect: Attempts to grant a write lock for the range of keys between [left_key, right_key].
        // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and populate the
        //          given conflicts set with the txnids that hold conflicting locks in the range.
        //          If the locktree cannot create more locks, return TOKUDB_OUT_OF_LOCKS.
        int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn);

        // effect: populate the conflicts set with the txnids that would prevent
        //         the given txnid from getting a lock on [left_key, right_key]
        void get_conflicts(bool is_write_request, TXNID txnid,
                const DBT *left_key, const DBT *right_key, txnid_set *conflicts);

        // effect: Release all of the lock ranges represented by the range buffer for a txnid.
        void release_locks(TXNID txnid, const range_buffer *ranges);
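
        // Illustrative sketch (not from the original header): acquiring a write
        // lock and later releasing it. The txnid_set and range_buffer calls
        // (create/append/destroy) are assumptions about those classes' APIs;
        // treat this as the shape of the call sequence, not the canonical one.
        //
        //     txnid_set conflicts;
        //     conflicts.create();
        //     int r = lt->acquire_write_lock(txnid, &left_key, &right_key,
        //                                    &conflicts, false /* big_txn */);
        //     if (r == DB_LOCK_NOTGRANTED) {
        //         // wait on (or kill) the transactions in `conflicts`
        //     } else if (r == 0) {
        //         range_buffer released;
        //         released.create();
        //         released.append(&left_key, &right_key);
        //         lt->release_locks(txnid, &released);
        //         released.destroy();
        //     }
        //     conflicts.destroy();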

        // effect: Runs escalation on this locktree
        void escalate(lt_escalate_cb after_escalate_callback, void *extra);

        // returns: The userdata associated with this locktree, or null if it has not been set.
        void *get_userdata(void) const;

        void set_userdata(void *userdata);

        locktree_manager *get_manager(void) const;

        void set_comparator(const comparator &cmp);

        int compare(const locktree *lt) const;

        DICTIONARY_ID get_dict_id() const;

        // Private info struct for storing pending lock request state.
        // Only to be used by lock requests. We store it here as
        // something less opaque than usual to strike a tradeoff between
        // abstraction and code complexity. It is still fairly abstract
        // since the lock_request object is opaque.
        struct lt_lock_request_info *get_lock_request_info(void);

    private:
        locktree_manager *m_mgr;
        DICTIONARY_ID m_dict_id;
        uint32_t m_reference_count;

        // Since the memory referenced by this comparator is not owned by the
        // locktree, the user must guarantee it will outlive the locktree.
        //
        // The ydb API accomplishes this by opening an ft_handle in the on_create
        // callback, which will keep the underlying FT (and its descriptor) in memory
        // for as long as the handle is open. The ft_handle is stored opaquely in the
        // userdata pointer below. See locktree_manager::get_lt with on_create_extra.
        comparator m_cmp;

        concurrent_tree *m_rangetree;

        void *m_userdata;
        struct lt_lock_request_info m_lock_request_info;

        // The following fields and members prefixed with "sto_" are for
        // the single txnid optimization, intended to speed up the case
        // when only one transaction is using the locktree. If we know
        // the locktree has only one transaction, then acquiring locks
        // takes O(1) work and releasing all locks takes O(1) work.
        //
        // How do we know that the locktree only has a single txnid?
        // What do we do if it does?
        //
        // When a txn with txnid T requests a lock (see the sketch below):
        // - If the tree is empty, the optimization is possible. Set the single
        // txnid to T, and insert the lock range into the buffer.
        // - If the tree is not empty, check if the single txnid is T. If so,
        // append the lock range to the buffer. Otherwise, migrate all of
        // the locks in the buffer into the rangetree on behalf of txnid T,
        // and invalidate the single txnid.
        //
        // When a txn with txnid T releases its locks:
        // - If the single txnid is valid, it must be for T. Destroy the buffer.
        // - If it's not valid, release locks the normal way in the rangetree.
        //
        // To carry out the optimization we need to record a single txnid
        // and a range buffer for each locktree, each protected by the root
        // lock of the locktree's rangetree. The root lock for a rangetree
        // is grabbed by preparing a locked keyrange on the rangetree.
        TXNID m_sto_txnid;
        range_buffer m_sto_buffer;
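
        // A rough sketch of that acquire-side decision in pseudocode (an
        // assumed shape only; the real logic lives in sto_try_acquire() and
        // acquire_lock_consolidated() below):
        //
        //     // on acquire of [left, right] by txnid T
        //     if (the tree is empty)
        //         sto_begin(T), append [left, right] to m_sto_buffer;
        //     else if (m_sto_txnid == T)
        //         append [left, right] to m_sto_buffer;
        //     else if (m_sto_txnid != TXNID_NONE)
        //         migrate m_sto_buffer into the rangetree, sto_end(),
        //         and fall back to a normal rangetree insert;
        //     else
        //         do a normal rangetree insert;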

        // The single txnid optimization speeds up the case when only one
        // transaction is using the locktree. But it has the potential to
        // hurt the case when more than one txnid exists.
        //
        // There are two things we need to do to make the optimization only
        // optimize the case we care about, and not hurt the general case.
        //
        // Bound the worst-case latency for lock migration when the
        // optimization stops working:
        // - Idea: Stop the optimization and migrate immediately if we notice
        // the single txnid has taken many locks in the range buffer.
        // - Implementation: Enforce a max size on the single txnid range buffer.
        // - Analysis: Choosing the perfect max value, M, is difficult to do
        // without some feedback from the field. Intuition tells us that M should
        // not be so small that the optimization is worthless, and it should not
        // be so big that it's unreasonable to have to wait behind a thread doing
        // the work of converting M buffer locks into rangetree locks.
        //
        // Prevent concurrent-transaction workloads from trying the optimization
        // in vain:
        // - Idea: Don't even bother trying the optimization if we think the
        // system is in a concurrent-transaction state.
        // - Implementation: Do something even simpler than detecting whether the
        // system is in a concurrent-transaction state. Just keep a "score" value
        // and some threshold. If at any time the locktree is eligible for the
        // optimization, only do it if the score is at this threshold. When you
        // actually do the optimization but someone has to migrate locks in the buffer
        // (expensive), then reset the score back to zero. Each time a txn
        // releases locks, the score is incremented by 1.
        // - Analysis: If you let the threshold be "C", then at most 1 / C txns will
        // do the optimization in a concurrent-transaction system. Similarly, it
        // takes at most C txns to start using the single txnid optimization, which
        // is good when the system transitions from multithreaded to single threaded.
        //
        // STO_BUFFER_MAX_SIZE:
        //
        // We choose the max value to be 1 million since most transactions are smaller
        // than 1 million and we can create a rangetree of 1 million elements in
        // less than a second. So we can be pretty confident that this threshold
        // enables the optimization almost always, and prevents super pathological
        // latency issues for the first lock taken by a second thread.
        //
        // STO_SCORE_THRESHOLD:
        //
        // A simple first guess at a good value for the score threshold is 100.
        // By our analysis, we'd end up doing the optimization in vain for
        // around 1% of all transactions, which seems reasonable. Further,
        // if the system goes single threaded, it ought to be pretty quick
        // for 100 transactions to go by, so we won't have to wait long before
        // we start doing the single txnid optimization again.
        static const int STO_BUFFER_MAX_SIZE = 50 * 1024;
        static const int STO_SCORE_THRESHOLD = 100;
        int m_sto_score;

        // statistics about time spent ending the STO early
        uint64_t m_sto_end_early_count;
        tokutime_t m_sto_end_early_time;

        // effect: begins the single txnid optimization, setting m_sto_txnid
        //         to the given txnid.
        // requires: m_sto_txnid is invalid
        void sto_begin(TXNID txnid);

        // effect: append a range to the sto buffer
        // requires: m_sto_txnid is valid
        void sto_append(const DBT *left_key, const DBT *right_key);

        // effect: ends the single txnid optimization, releasing any memory
        //         stored in the sto buffer, notifying the tracker, and
        //         invalidating m_sto_txnid.
        // requires: m_sto_txnid is valid
        void sto_end(void);

        // params: prepared_lkr is a void * to a prepared locked keyrange. see below.
        // effect: ends the single txnid optimization early, migrating buffer locks
        //         into the rangetree, calling sto_end(), and then setting the
        //         sto_score back to zero.
        // requires: m_sto_txnid is valid
        void sto_end_early(void *prepared_lkr);
        void sto_end_early_no_accounting(void *prepared_lkr);

        // params: prepared_lkr is a void * to a prepared locked keyrange. we can't use
        //         the real type because the compiler won't allow us to forward declare
        //         concurrent_tree::locked_keyrange without including concurrent_tree.h,
        //         which we cannot do here because it is a template implementation.
        // requires: the prepared locked keyrange is for the locktree's rangetree
        // requires: m_sto_txnid is valid
        // effect: migrates each lock in the single txnid buffer into the locktree's
        //         rangetree, notifying the memory tracker as necessary.
        void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr);

        // effect: If m_sto_txnid is valid, then release the txnid's locks
        //         by ending the optimization.
        // requires: If m_sto_txnid is valid, it is equal to the given txnid
        // returns: True if locks were released for this txnid
        bool sto_try_release(TXNID txnid);

        // params: prepared_lkr is a void * to a prepared locked keyrange. see above.
        // requires: the prepared locked keyrange is for the locktree's rangetree
        // effect: If m_sto_txnid is valid and equal to the given txnid, then
        //         append a range onto the buffer. Otherwise, if m_sto_txnid is valid
        //         but not equal to this txnid, then migrate the buffer's locks
        //         into the rangetree and end the optimization, setting the score
        //         back to zero.
        // returns: true if the lock was acquired for this txnid
        bool sto_try_acquire(void *prepared_lkr, TXNID txnid,
                const DBT *left_key, const DBT *right_key);

        // Effect:
        //  Provides a hook for a helgrind suppression.
        // Returns:
        //  true if m_sto_txnid is not TXNID_NONE
        bool sto_txnid_is_valid_unsafe(void) const;

        // Effect:
        //  Provides a hook for a helgrind suppression.
        // Returns:
        //  m_sto_score
        int sto_get_score_unsafe(void) const;

        void remove_overlapping_locks_for_txnid(TXNID txnid,
                                                const DBT *left_key, const DBT *right_key);

        int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
                                      const DBT *left_key, const DBT *right_key,
                                      txnid_set *conflicts);

        int acquire_lock(bool is_write_request, TXNID txnid,
                         const DBT *left_key, const DBT *right_key,
                         txnid_set *conflicts);

        int try_acquire_lock(bool is_write_request, TXNID txnid,
                             const DBT *left_key, const DBT *right_key,
                             txnid_set *conflicts, bool big_txn);


        friend class locktree_unit_test;
        friend class manager_unit_test;
        friend class lock_request_unit_test;

        // engine status reaches into the locktree to read some stats
        friend void locktree_manager::get_status(LTM_STATUS status);
    };

} /* namespace toku */
