1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: 3 #ident "$Id$" 4 /*====== 5 This file is part of PerconaFT. 6 7 8 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 9 10 PerconaFT is free software: you can redistribute it and/or modify 11 it under the terms of the GNU General Public License, version 2, 12 as published by the Free Software Foundation. 13 14 PerconaFT is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License 20 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. 21 22 ---------------------------------------- 23 24 PerconaFT is free software: you can redistribute it and/or modify 25 it under the terms of the GNU Affero General Public License, version 3, 26 as published by the Free Software Foundation. 27 28 PerconaFT is distributed in the hope that it will be useful, 29 but WITHOUT ANY WARRANTY; without even the implied warranty of 30 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 31 GNU Affero General Public License for more details. 32 33 You should have received a copy of the GNU Affero General Public License 34 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. 35 36 ---------------------------------------- 37 38 Licensed under the Apache License, Version 2.0 (the "License"); 39 you may not use this file except in compliance with the License. 40 You may obtain a copy of the License at 41 42 http://www.apache.org/licenses/LICENSE-2.0 43 44 Unless required by applicable law or agreed to in writing, software 45 distributed under the License is distributed on an "AS IS" BASIS, 46 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 47 See the License for the specific language governing permissions and 48 limitations under the License. 49 ======= */ 50 51 #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 52 53 #pragma once 54 55 #include <atomic> 56 57 #include <db.h> 58 #include <toku_pthread.h> 59 #include <toku_time.h> 60 61 #include <ft/comparator.h> 62 #include <ft/ft-ops.h> // just for DICTIONARY_ID.. 63 64 #include <util/omt.h> 65 66 #include "txnid_set.h" 67 #include "wfg.h" 68 #include "range_buffer.h" 69 70 71 namespace toku { 72 73 class locktree; 74 class locktree_manager; 75 class lock_request; 76 class concurrent_tree; 77 78 typedef int (*lt_create_cb)(locktree *lt, void *extra); 79 typedef void (*lt_destroy_cb)(locktree *lt); 80 typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt, const range_buffer &buffer, void *extra); 81 82 struct lt_counters { 83 uint64_t wait_count, wait_time; 84 uint64_t long_wait_count, long_wait_time; 85 uint64_t timeout_count; 86 addlt_counters87 void add(const lt_counters &rhs) { 88 wait_count += rhs.wait_count; 89 wait_time += rhs.wait_time; 90 long_wait_count += rhs.long_wait_count; 91 long_wait_time += rhs.long_wait_time; 92 timeout_count += rhs.timeout_count; 93 } 94 }; 95 96 // Lock request state for some locktree 97 struct lt_lock_request_info { 98 omt<lock_request *> pending_lock_requests; 99 std::atomic_bool pending_is_empty; 100 toku_mutex_t mutex; 101 bool should_retry_lock_requests; 102 lt_counters counters; 103 std::atomic_ullong retry_want; 104 unsigned long long retry_done; 105 toku_mutex_t retry_mutex; 106 toku_cond_t retry_cv; 107 bool running_retry; 108 109 void init(void); 110 void destroy(void); 111 }; 112 113 // The locktree manager manages a set of locktrees, one for each open 114 // dictionary. Locktrees are retrieved from the manager. When they are no 115 // longer needed, they are be released by the user. 116 class locktree_manager { 117 public: 118 // param: create_cb, called just after a locktree is first created. 119 // destroy_cb, called just before a locktree is destroyed. 120 // escalate_cb, called after a locktree is escalated (with extra 121 // param) 122 void create(lt_create_cb create_cb, 123 lt_destroy_cb destroy_cb, 124 lt_escalate_cb escalate_cb, 125 void *extra); 126 127 void destroy(void); 128 129 size_t get_max_lock_memory(void); 130 131 int set_max_lock_memory(size_t max_lock_memory); 132 133 // effect: Get a locktree from the manager. If a locktree exists with the given 134 // dict_id, it is referenced and then returned. If one did not exist, it 135 // is created. It will use the comparator for comparing keys. The on_create 136 // callback (passed to locktree_manager::create()) will be called with the 137 // given extra parameter. 138 locktree *get_lt(DICTIONARY_ID dict_id, const comparator &cmp, void *on_create_extra); 139 140 void reference_lt(locktree *lt); 141 142 // effect: Releases one reference on a locktree. If the reference count transitions 143 // to zero, the on_destroy callback is called before it gets destroyed. 144 void release_lt(locktree *lt); 145 146 void get_status(LTM_STATUS status); 147 148 // effect: calls the iterate function on each pending lock request 149 // note: holds the manager's mutex 150 typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id, 151 TXNID txnid, 152 const DBT *left_key, 153 const DBT *right_key, 154 TXNID blocking_txnid, 155 uint64_t start_time, 156 void *extra); 157 int iterate_pending_lock_requests(lock_request_iterate_callback cb, void *extra); 158 159 // effect: Determines if too many locks or too much memory is being used, 160 // Runs escalation on the manager if so. 161 // param: big_txn, if the current transaction is 'big' (has spilled rollback logs) 162 // returns: 0 if there enough resources to create a new lock, or TOKUDB_OUT_OF_LOCKS 163 // if there are not enough resources and lock escalation failed to free up 164 // enough resources for a new lock. 165 int check_current_lock_constraints(bool big_txn); 166 167 bool over_big_threshold(void); 168 169 void note_mem_used(uint64_t mem_used); 170 171 void note_mem_released(uint64_t mem_freed); 172 173 bool out_of_locks(void) const; 174 175 // Escalate all locktrees 176 void escalate_all_locktrees(void); 177 178 // Escalate a set of locktrees 179 void escalate_locktrees(locktree **locktrees, int num_locktrees); 180 181 // effect: calls the private function run_escalation(), only ok to 182 // do for tests. 183 // rationale: to get better stress test coverage, we want a way to 184 // deterministicly trigger lock escalation. 185 void run_escalation_for_test(void); 186 void run_escalation(void); 187 188 // Add time t to the escalator's wait time statistics 189 void add_escalator_wait_time(uint64_t t); 190 191 void kill_waiter(void *extra); 192 193 private: 194 static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024; 195 196 // tracks the current number of locks and lock memory 197 uint64_t m_max_lock_memory; 198 uint64_t m_current_lock_memory; 199 200 struct lt_counters m_lt_counters; 201 202 // the create and destroy callbacks for the locktrees 203 lt_create_cb m_lt_create_callback; 204 lt_destroy_cb m_lt_destroy_callback; 205 lt_escalate_cb m_lt_escalate_callback; 206 void *m_lt_escalate_callback_extra; 207 208 omt<locktree *> m_locktree_map; 209 210 // the manager's mutex protects the locktree map 211 toku_mutex_t m_mutex; 212 213 void mutex_lock(void); 214 215 void mutex_unlock(void); 216 217 // Manage the set of open locktrees 218 locktree *locktree_map_find(const DICTIONARY_ID &dict_id); 219 void locktree_map_put(locktree *lt); 220 void locktree_map_remove(locktree *lt); 221 222 static int find_by_dict_id(locktree *const <, const DICTIONARY_ID &dict_id); 223 224 void escalator_init(void); 225 void escalator_destroy(void); 226 227 // statistics about lock escalation. 228 toku_mutex_t m_escalation_mutex; 229 uint64_t m_escalation_count; 230 tokutime_t m_escalation_time; 231 uint64_t m_escalation_latest_result; 232 uint64_t m_wait_escalation_count; 233 uint64_t m_wait_escalation_time; 234 uint64_t m_long_wait_escalation_count; 235 uint64_t m_long_wait_escalation_time; 236 237 // the escalator coordinates escalation on a set of locktrees for a bunch of threads 238 class locktree_escalator { 239 public: 240 void create(void); 241 void destroy(void); 242 void run(locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra), void *extra); 243 244 private: 245 toku_mutex_t m_escalator_mutex; 246 toku_cond_t m_escalator_done; 247 bool m_escalator_running; 248 }; 249 250 locktree_escalator m_escalator; 251 252 friend class manager_unit_test; 253 }; 254 255 // A locktree represents the set of row locks owned by all transactions 256 // over an open dictionary. Read and write ranges are represented as 257 // a left and right key which are compared with the given comparator 258 // 259 // Locktrees are not created and destroyed by the user. Instead, they are 260 // referenced and released using the locktree manager. 261 // 262 // A sample workflow looks like this: 263 // - Create a manager. 264 // - Get a locktree by dictionaroy id from the manager. 265 // - Perform read/write lock acquision on the locktree, add references to 266 // the locktree using the manager, release locks, release references, etc. 267 // - ... 268 // - Release the final reference to the locktree. It will be destroyed. 269 // - Destroy the manager. 270 class locktree { 271 public: 272 // effect: Creates a locktree 273 void create(locktree_manager *mgr, DICTIONARY_ID dict_id, const comparator &cmp); 274 275 void destroy(void); 276 277 // For thread-safe, external reference counting 278 void add_reference(void); 279 280 // requires: the reference count is > 0 281 // returns: the reference count, after decrementing it by one 282 uint32_t release_reference(void); 283 284 // returns: the current reference count 285 uint32_t get_reference_count(void); 286 287 // effect: Attempts to grant a read lock for the range of keys between [left_key, right_key]. 288 // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and populate the 289 // given conflicts set with the txnids that hold conflicting locks in the range. 290 // If the locktree cannot create more locks, return TOKUDB_OUT_OF_LOCKS. 291 // note: Read locks cannot be shared between txnids, as one would expect. 292 // This is for simplicity since read locks are rare in MySQL. 293 int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn); 294 295 // effect: Attempts to grant a write lock for the range of keys between [left_key, right_key]. 296 // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and populate the 297 // given conflicts set with the txnids that hold conflicting locks in the range. 298 // If the locktree cannot create more locks, return TOKUDB_OUT_OF_LOCKS. 299 int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn); 300 301 // effect: populate the conflicts set with the txnids that would preventing 302 // the given txnid from getting a lock on [left_key, right_key] 303 void get_conflicts(bool is_write_request, TXNID txnid, 304 const DBT *left_key, const DBT *right_key, txnid_set *conflicts); 305 306 // effect: Release all of the lock ranges represented by the range buffer for a txnid. 307 void release_locks(TXNID txnid, const range_buffer *ranges); 308 309 // effect: Runs escalation on this locktree 310 void escalate(lt_escalate_cb after_escalate_callback, void *extra); 311 312 // returns: The userdata associated with this locktree, or null if it has not been set. 313 void *get_userdata(void) const; 314 315 void set_userdata(void *userdata); 316 317 locktree_manager *get_manager(void) const; 318 319 void set_comparator(const comparator &cmp); 320 321 int compare(const locktree *lt) const; 322 323 DICTIONARY_ID get_dict_id() const; 324 325 // Private info struct for storing pending lock request state. 326 // Only to be used by lock requests. We store it here as 327 // something less opaque than usual to strike a tradeoff between 328 // abstraction and code complexity. It is still fairly abstract 329 // since the lock_request object is opaque 330 struct lt_lock_request_info *get_lock_request_info(void); 331 332 private: 333 locktree_manager *m_mgr; 334 DICTIONARY_ID m_dict_id; 335 uint32_t m_reference_count; 336 337 // Since the memory referenced by this comparator is not owned by the 338 // locktree, the user must guarantee it will outlive the locktree. 339 // 340 // The ydb API accomplishes this by opening an ft_handle in the on_create 341 // callback, which will keep the underlying FT (and its descriptor) in memory 342 // for as long as the handle is open. The ft_handle is stored opaquely in the 343 // userdata pointer below. see locktree_manager::get_lt w/ on_create_extra 344 comparator m_cmp; 345 346 concurrent_tree *m_rangetree; 347 348 void *m_userdata; 349 struct lt_lock_request_info m_lock_request_info; 350 351 // The following fields and members prefixed with "sto_" are for 352 // the single txnid optimization, intended to speed up the case 353 // when only one transaction is using the locktree. If we know 354 // the locktree has only one transaction, then acquiring locks 355 // takes O(1) work and releasing all locks takes O(1) work. 356 // 357 // How do we know that the locktree only has a single txnid? 358 // What do we do if it does? 359 // 360 // When a txn with txnid T requests a lock: 361 // - If the tree is empty, the optimization is possible. Set the single 362 // txnid to T, and insert the lock range into the buffer. 363 // - If the tree is not empty, check if the single txnid is T. If so, 364 // append the lock range to the buffer. Otherwise, migrate all of 365 // the locks in the buffer into the rangetree on behalf of txnid T, 366 // and invalid the single txnid. 367 // 368 // When a txn with txnid T releases its locks: 369 // - If the single txnid is valid, it must be for T. Destroy the buffer. 370 // - If it's not valid, release locks the normal way in the rangetree. 371 // 372 // To carry out the optimization we need to record a single txnid 373 // and a range buffer for each locktree, each protected by the root 374 // lock of the locktree's rangetree. The root lock for a rangetree 375 // is grabbed by preparing a locked keyrange on the rangetree. 376 TXNID m_sto_txnid; 377 range_buffer m_sto_buffer; 378 379 // The single txnid optimization speeds up the case when only one 380 // transaction is using the locktree. But it has the potential to 381 // hurt the case when more than one txnid exists. 382 // 383 // There are two things we need to do to make the optimization only 384 // optimize the case we care about, and not hurt the general case. 385 // 386 // Bound the worst-case latency for lock migration when the 387 // optimization stops working: 388 // - Idea: Stop the optimization and migrate immediate if we notice 389 // the single txnid has takes many locks in the range buffer. 390 // - Implementation: Enforce a max size on the single txnid range buffer. 391 // - Analysis: Choosing the perfect max value, M, is difficult to do 392 // without some feedback from the field. Intuition tells us that M should 393 // not be so small that the optimization is worthless, and it should not 394 // be so big that it's unreasonable to have to wait behind a thread doing 395 // the work of converting M buffer locks into rangetree locks. 396 // 397 // Prevent concurrent-transaction workloads from trying the optimization 398 // in vain: 399 // - Idea: Don't even bother trying the optimization if we think the 400 // system is in a concurrent-transaction state. 401 // - Implementation: Do something even simpler than detecting whether the 402 // system is in a concurent-transaction state. Just keep a "score" value 403 // and some threshold. If at any time the locktree is eligible for the 404 // optimization, only do it if the score is at this threshold. When you 405 // actually do the optimization but someone has to migrate locks in the buffer 406 // (expensive), then reset the score back to zero. Each time a txn 407 // releases locks, the score is incremented by 1. 408 // - Analysis: If you let the threshold be "C", then at most 1 / C txns will 409 // do the optimization in a concurrent-transaction system. Similarly, it 410 // takes at most C txns to start using the single txnid optimzation, which 411 // is good when the system transitions from multithreaded to single threaded. 412 // 413 // STO_BUFFER_MAX_SIZE: 414 // 415 // We choose the max value to be 1 million since most transactions are smaller 416 // than 1 million and we can create a rangetree of 1 million elements in 417 // less than a second. So we can be pretty confident that this threshold 418 // enables the optimization almost always, and prevents super pathological 419 // latency issues for the first lock taken by a second thread. 420 // 421 // STO_SCORE_THRESHOLD: 422 // 423 // A simple first guess at a good value for the score threshold is 100. 424 // By our analysis, we'd end up doing the optimization in vain for 425 // around 1% of all transactions, which seems reasonable. Further, 426 // if the system goes single threaded, it ought to be pretty quick 427 // for 100 transactions to go by, so we won't have to wait long before 428 // we start doing the single txind optimzation again. 429 static const int STO_BUFFER_MAX_SIZE = 50 * 1024; 430 static const int STO_SCORE_THRESHOLD = 100; 431 int m_sto_score; 432 433 // statistics about time spent ending the STO early 434 uint64_t m_sto_end_early_count; 435 tokutime_t m_sto_end_early_time; 436 437 // effect: begins the single txnid optimizaiton, setting m_sto_txnid 438 // to the given txnid. 439 // requires: m_sto_txnid is invalid 440 void sto_begin(TXNID txnid); 441 442 // effect: append a range to the sto buffer 443 // requires: m_sto_txnid is valid 444 void sto_append(const DBT *left_key, const DBT *right_key); 445 446 // effect: ends the single txnid optimization, releaseing any memory 447 // stored in the sto buffer, notifying the tracker, and 448 // invalidating m_sto_txnid. 449 // requires: m_sto_txnid is valid 450 void sto_end(void); 451 452 // params: prepared_lkr is a void * to a prepared locked keyrange. see below. 453 // effect: ends the single txnid optimization early, migrating buffer locks 454 // into the rangetree, calling sto_end(), and then setting the 455 // sto_score back to zero. 456 // requires: m_sto_txnid is valid 457 void sto_end_early(void *prepared_lkr); 458 void sto_end_early_no_accounting(void *prepared_lkr); 459 460 // params: prepared_lkr is a void * to a prepared locked keyrange. we can't use 461 // the real type because the compiler won't allow us to forward declare 462 // concurrent_tree::locked_keyrange without including concurrent_tree.h, 463 // which we cannot do here because it is a template implementation. 464 // requires: the prepared locked keyrange is for the locktree's rangetree 465 // requires: m_sto_txnid is valid 466 // effect: migrates each lock in the single txnid buffer into the locktree's 467 // rangetree, notifying the memory tracker as necessary. 468 void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr); 469 470 // effect: If m_sto_txnid is valid, then release the txnid's locks 471 // by ending the optimization. 472 // requires: If m_sto_txnid is valid, it is equal to the given txnid 473 // returns: True if locks were released for this txnid 474 bool sto_try_release(TXNID txnid); 475 476 // params: prepared_lkr is a void * to a prepared locked keyrange. see above. 477 // requires: the prepared locked keyrange is for the locktree's rangetree 478 // effect: If m_sto_txnid is valid and equal to the given txnid, then 479 // append a range onto the buffer. Otherwise, if m_sto_txnid is valid 480 // but not equal to this txnid, then migrate the buffer's locks 481 // into the rangetree and end the optimization, setting the score 482 // back to zero. 483 // returns: true if the lock was acquired for this txnid 484 bool sto_try_acquire(void *prepared_lkr, TXNID txnid, 485 const DBT *left_key, const DBT *right_key); 486 487 // Effect: 488 // Provides a hook for a helgrind suppression. 489 // Returns: 490 // true if m_sto_txnid is not TXNID_NONE 491 bool sto_txnid_is_valid_unsafe(void) const; 492 493 // Effect: 494 // Provides a hook for a helgrind suppression. 495 // Returns: 496 // m_sto_score 497 int sto_get_score_unsafe(void )const; 498 499 void remove_overlapping_locks_for_txnid(TXNID txnid, 500 const DBT *left_key, const DBT *right_key); 501 502 int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid, 503 const DBT *left_key, const DBT *right_key, 504 txnid_set *conflicts); 505 506 int acquire_lock(bool is_write_request, TXNID txnid, 507 const DBT *left_key, const DBT *right_key, 508 txnid_set *conflicts); 509 510 int try_acquire_lock(bool is_write_request, TXNID txnid, 511 const DBT *left_key, const DBT *right_key, 512 txnid_set *conflicts, bool big_txn); 513 514 515 friend class locktree_unit_test; 516 friend class manager_unit_test; 517 friend class lock_request_unit_test; 518 519 // engine status reaches into the locktree to read some stats 520 friend void locktree_manager::get_status(LTM_STATUS status); 521 }; 522 523 } /* namespace toku */ 524