1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3 #ident "$Id$"
4 /*======
5 This file is part of PerconaFT.
6
7
8 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9
10 PerconaFT is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License, version 2,
12 as published by the Free Software Foundation.
13
14 PerconaFT is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
21
22 ----------------------------------------
23
24 PerconaFT is free software: you can redistribute it and/or modify
25 it under the terms of the GNU Affero General Public License, version 3,
26 as published by the Free Software Foundation.
27
28 PerconaFT is distributed in the hope that it will be useful,
29 but WITHOUT ANY WARRANTY; without even the implied warranty of
30 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 GNU Affero General Public License for more details.
32
33 You should have received a copy of the GNU Affero General Public License
34 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
35
36 ----------------------------------------
37
38 Licensed under the Apache License, Version 2.0 (the "License");
39 you may not use this file except in compliance with the License.
40 You may obtain a copy of the License at
41
42 http://www.apache.org/licenses/LICENSE-2.0
43
44 Unless required by applicable law or agreed to in writing, software
45 distributed under the License is distributed on an "AS IS" BASIS,
46 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
47 See the License for the specific language governing permissions and
48 limitations under the License.
49 ======= */
50
51 #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
52
53 #include <toku_race_tools.h>
54
55 // TODO: source location info might have to be pulled up one caller
56 // to be useful
mutex_lock(void)57 void treenode::mutex_lock(void) { toku_mutex_lock(&m_mutex); }
58
mutex_unlock(void)59 void treenode::mutex_unlock(void) {
60 toku_mutex_unlock(&m_mutex);
61 }
62
init(const comparator * cmp)63 void treenode::init(const comparator *cmp) {
64 m_txnid = TXNID_NONE;
65 m_is_root = false;
66 m_is_empty = true;
67 m_cmp = cmp;
68 // use an adaptive mutex at each node since we expect the time the
69 // lock is held to be relatively short compared to a context switch.
70 // indeed, this improves performance at high thread counts considerably.
71 memset(&m_mutex, 0, sizeof(toku_mutex_t));
72 toku_pthread_mutexattr_t attr;
73 toku_mutexattr_init(&attr);
74 toku_mutexattr_settype(&attr, TOKU_MUTEX_ADAPTIVE);
75 toku_mutex_init(*treenode_mutex_key, &m_mutex, &attr);
76 toku_mutexattr_destroy(&attr);
77 m_left_child.set(nullptr);
78 m_right_child.set(nullptr);
79 }
80
create_root(const comparator * cmp)81 void treenode::create_root(const comparator *cmp) {
82 init(cmp);
83 m_is_root = true;
84 }
85
destroy_root(void)86 void treenode::destroy_root(void) {
87 invariant(is_root());
88 invariant(is_empty());
89 toku_mutex_destroy(&m_mutex);
90 m_cmp = nullptr;
91 }
92
set_range_and_txnid(const keyrange & range,TXNID txnid)93 void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid) {
94 // allocates a new copy of the range for this node
95 m_range.create_copy(range);
96 m_txnid = txnid;
97 m_is_empty = false;
98 }
99
is_root(void)100 bool treenode::is_root(void) {
101 return m_is_root;
102 }
103
is_empty(void)104 bool treenode::is_empty(void) {
105 return m_is_empty;
106 }
107
range_overlaps(const keyrange & range)108 bool treenode::range_overlaps(const keyrange &range) {
109 return m_range.overlaps(*m_cmp, range);
110 }
111
alloc(const comparator * cmp,const keyrange & range,TXNID txnid)112 treenode *treenode::alloc(const comparator *cmp, const keyrange &range, TXNID txnid) {
113 treenode *XCALLOC(node);
114 node->init(cmp);
115 node->set_range_and_txnid(range, txnid);
116 return node;
117 }
118
swap_in_place(treenode * node1,treenode * node2)119 void treenode::swap_in_place(treenode *node1, treenode *node2) {
120 keyrange tmp_range = node1->m_range;
121 TXNID tmp_txnid = node1->m_txnid;
122 node1->m_range = node2->m_range;
123 node1->m_txnid = node2->m_txnid;
124 node2->m_range = tmp_range;
125 node2->m_txnid = tmp_txnid;
126 }
127
free(treenode * node)128 void treenode::free(treenode *node) {
129 // destroy the range, freeing any copied keys
130 node->m_range.destroy();
131
132 // the root is simply marked as empty.
133 if (node->is_root()) {
134 toku_mutex_assert_locked(&node->m_mutex);
135 node->m_is_empty = true;
136 } else {
137 toku_mutex_assert_unlocked(&node->m_mutex);
138 toku_mutex_destroy(&node->m_mutex);
139 toku_free(node);
140 }
141 }
142
get_depth_estimate(void) const143 uint32_t treenode::get_depth_estimate(void) const {
144 const uint32_t left_est = m_left_child.depth_est;
145 const uint32_t right_est = m_right_child.depth_est;
146 return (left_est > right_est ? left_est : right_est) + 1;
147 }
148
find_node_with_overlapping_child(const keyrange & range,const keyrange::comparison * cmp_hint)149 treenode *treenode::find_node_with_overlapping_child(const keyrange &range,
150 const keyrange::comparison *cmp_hint) {
151
152 // determine which child to look at based on a comparison. if we were
153 // given a comparison hint, use that. otherwise, compare them now.
154 keyrange::comparison c = cmp_hint ? *cmp_hint : range.compare(*m_cmp, m_range);
155
156 treenode *child;
157 if (c == keyrange::comparison::LESS_THAN) {
158 child = lock_and_rebalance_left();
159 } else {
160 // The caller (locked_keyrange::acquire) handles the case where
161 // the root of the locked_keyrange is the node that overlaps.
162 // range is guaranteed not to overlap this node.
163 invariant(c == keyrange::comparison::GREATER_THAN);
164 child = lock_and_rebalance_right();
165 }
166
167 // if the search would lead us to an empty subtree (child == nullptr),
168 // or the child overlaps, then we know this node is the parent we want.
169 // otherwise we need to recur into that child.
170 if (child == nullptr) {
171 return this;
172 } else {
173 c = range.compare(*m_cmp, child->m_range);
174 if (c == keyrange::comparison::EQUALS || c == keyrange::comparison::OVERLAPS) {
175 child->mutex_unlock();
176 return this;
177 } else {
178 // unlock this node before recurring into the locked child,
179 // passing in a comparison hint since we just comapred range
180 // to the child's range.
181 mutex_unlock();
182 return child->find_node_with_overlapping_child(range, &c);
183 }
184 }
185 }
186
187 template <class F>
traverse_overlaps(const keyrange & range,F * function)188 void treenode::traverse_overlaps(const keyrange &range, F *function) {
189 keyrange::comparison c = range.compare(*m_cmp, m_range);
190 if (c == keyrange::comparison::EQUALS) {
191 // Doesn't matter if fn wants to keep going, there
192 // is nothing left, so return.
193 function->fn(m_range, m_txnid);
194 return;
195 }
196
197 treenode *left = m_left_child.get_locked();
198 if (left) {
199 if (c != keyrange::comparison::GREATER_THAN) {
200 // Target range is less than this node, or it overlaps this
201 // node. There may be something on the left.
202 left->traverse_overlaps(range, function);
203 }
204 left->mutex_unlock();
205 }
206
207 if (c == keyrange::comparison::OVERLAPS) {
208 bool keep_going = function->fn(m_range, m_txnid);
209 if (!keep_going) {
210 return;
211 }
212 }
213
214 treenode *right = m_right_child.get_locked();
215 if (right) {
216 if (c != keyrange::comparison::LESS_THAN) {
217 // Target range is greater than this node, or it overlaps this
218 // node. There may be something on the right.
219 right->traverse_overlaps(range, function);
220 }
221 right->mutex_unlock();
222 }
223 }
224
insert(const keyrange & range,TXNID txnid)225 void treenode::insert(const keyrange &range, TXNID txnid) {
226 // choose a child to check. if that child is null, then insert the new node there.
227 // otherwise recur down that child's subtree
228 keyrange::comparison c = range.compare(*m_cmp, m_range);
229 if (c == keyrange::comparison::LESS_THAN) {
230 treenode *left_child = lock_and_rebalance_left();
231 if (left_child == nullptr) {
232 left_child = treenode::alloc(m_cmp, range, txnid);
233 m_left_child.set(left_child);
234 } else {
235 left_child->insert(range, txnid);
236 left_child->mutex_unlock();
237 }
238 } else {
239 invariant(c == keyrange::comparison::GREATER_THAN);
240 treenode *right_child = lock_and_rebalance_right();
241 if (right_child == nullptr) {
242 right_child = treenode::alloc(m_cmp, range, txnid);
243 m_right_child.set(right_child);
244 } else {
245 right_child->insert(range, txnid);
246 right_child->mutex_unlock();
247 }
248 }
249 }
250
find_child_at_extreme(int direction,treenode ** parent)251 treenode *treenode::find_child_at_extreme(int direction, treenode **parent) {
252 treenode *child = direction > 0 ?
253 m_right_child.get_locked() : m_left_child.get_locked();
254
255 if (child) {
256 *parent = this;
257 treenode *child_extreme = child->find_child_at_extreme(direction, parent);
258 child->mutex_unlock();
259 return child_extreme;
260 } else {
261 return this;
262 }
263 }
264
find_leftmost_child(treenode ** parent)265 treenode *treenode::find_leftmost_child(treenode **parent) {
266 return find_child_at_extreme(-1, parent);
267 }
268
find_rightmost_child(treenode ** parent)269 treenode *treenode::find_rightmost_child(treenode **parent) {
270 return find_child_at_extreme(1, parent);
271 }
272
remove_root_of_subtree()273 treenode *treenode::remove_root_of_subtree() {
274 // if this node has no children, just free it and return null
275 if (m_left_child.ptr == nullptr && m_right_child.ptr == nullptr) {
276 // treenode::free requires that non-root nodes are unlocked
277 if (!is_root()) {
278 mutex_unlock();
279 }
280 treenode::free(this);
281 return nullptr;
282 }
283
284 // we have a child, so get either the in-order successor or
285 // predecessor of this node to be our replacement.
286 // replacement_parent is updated by the find functions as
287 // they recur down the tree, so initialize it to this.
288 treenode *child, *replacement;
289 treenode *replacement_parent = this;
290 if (m_left_child.ptr != nullptr) {
291 child = m_left_child.get_locked();
292 replacement = child->find_rightmost_child(&replacement_parent);
293 invariant(replacement == child || replacement_parent != this);
294
295 // detach the replacement from its parent
296 if (replacement_parent == this) {
297 m_left_child = replacement->m_left_child;
298 } else {
299 replacement_parent->m_right_child = replacement->m_left_child;
300 }
301 } else {
302 child = m_right_child.get_locked();
303 replacement = child->find_leftmost_child(&replacement_parent);
304 invariant(replacement == child || replacement_parent != this);
305
306 // detach the replacement from its parent
307 if (replacement_parent == this) {
308 m_right_child = replacement->m_right_child;
309 } else {
310 replacement_parent->m_left_child = replacement->m_right_child;
311 }
312 }
313 child->mutex_unlock();
314
315 // swap in place with the detached replacement, then destroy it
316 treenode::swap_in_place(replacement, this);
317 treenode::free(replacement);
318
319 return this;
320 }
321
recursive_remove(void)322 void treenode::recursive_remove(void) {
323 treenode *left = m_left_child.ptr;
324 if (left) {
325 left->recursive_remove();
326 }
327 m_left_child.set(nullptr);
328
329 treenode *right = m_right_child.ptr;
330 if (right) {
331 right->recursive_remove();
332 }
333 m_right_child.set(nullptr);
334
335 // we do not take locks on the way down, so we know non-root nodes
336 // are unlocked here and the caller is required to pass a locked
337 // root, so this free is correct.
338 treenode::free(this);
339 }
340
remove(const keyrange & range)341 treenode *treenode::remove(const keyrange &range) {
342 treenode *child;
343 // if the range is equal to this node's range, then just remove
344 // the root of this subtree. otherwise search down the tree
345 // in either the left or right children.
346 keyrange::comparison c = range.compare(*m_cmp, m_range);
347 switch (c) {
348 case keyrange::comparison::EQUALS:
349 return remove_root_of_subtree();
350 case keyrange::comparison::LESS_THAN:
351 child = m_left_child.get_locked();
352 invariant_notnull(child);
353 child = child->remove(range);
354
355 // unlock the child if there still is one.
356 // regardless, set the right child pointer
357 if (child) {
358 child->mutex_unlock();
359 }
360 m_left_child.set(child);
361 break;
362 case keyrange::comparison::GREATER_THAN:
363 child = m_right_child.get_locked();
364 invariant_notnull(child);
365 child = child->remove(range);
366
367 // unlock the child if there still is one.
368 // regardless, set the right child pointer
369 if (child) {
370 child->mutex_unlock();
371 }
372 m_right_child.set(child);
373 break;
374 case keyrange::comparison::OVERLAPS:
375 // shouldn't be overlapping, since the tree is
376 // non-overlapping and this range must exist
377 abort();
378 }
379
380 return this;
381 }
382
left_imbalanced(int threshold) const383 bool treenode::left_imbalanced(int threshold) const {
384 uint32_t left_depth = m_left_child.depth_est;
385 uint32_t right_depth = m_right_child.depth_est;
386 return m_left_child.ptr != nullptr && left_depth > threshold + right_depth;
387 }
388
right_imbalanced(int threshold) const389 bool treenode::right_imbalanced(int threshold) const {
390 uint32_t left_depth = m_left_child.depth_est;
391 uint32_t right_depth = m_right_child.depth_est;
392 return m_right_child.ptr != nullptr && right_depth > threshold + left_depth;
393 }
394
395 // effect: rebalances the subtree rooted at this node
396 // using AVL style O(1) rotations. unlocks this
397 // node if it is not the new root of the subtree.
398 // requires: node is locked by this thread, children are not
399 // returns: locked root node of the rebalanced tree
maybe_rebalance(void)400 treenode *treenode::maybe_rebalance(void) {
401 // if we end up not rotating at all, the new root is this
402 treenode *new_root = this;
403 treenode *child = nullptr;
404
405 if (left_imbalanced(IMBALANCE_THRESHOLD)) {
406 child = m_left_child.get_locked();
407 if (child->right_imbalanced(0)) {
408 treenode *grandchild = child->m_right_child.get_locked();
409
410 child->m_right_child = grandchild->m_left_child;
411 grandchild->m_left_child.set(child);
412
413 m_left_child = grandchild->m_right_child;
414 grandchild->m_right_child.set(this);
415
416 new_root = grandchild;
417 } else {
418 m_left_child = child->m_right_child;
419 child->m_right_child.set(this);
420 new_root = child;
421 }
422 } else if (right_imbalanced(IMBALANCE_THRESHOLD)) {
423 child = m_right_child.get_locked();
424 if (child->left_imbalanced(0)) {
425 treenode *grandchild = child->m_left_child.get_locked();
426
427 child->m_left_child = grandchild->m_right_child;
428 grandchild->m_right_child.set(child);
429
430 m_right_child = grandchild->m_left_child;
431 grandchild->m_left_child.set(this);
432
433 new_root = grandchild;
434 } else {
435 m_right_child = child->m_left_child;
436 child->m_left_child.set(this);
437 new_root = child;
438 }
439 }
440
441 // up to three nodes may be locked.
442 // - this
443 // - child
444 // - grandchild (but if it is locked, its the new root)
445 //
446 // one of them is the new root. we unlock everything except the new root.
447 if (child && child != new_root) {
448 TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&child->m_mutex);
449 child->mutex_unlock();
450 }
451 if (this != new_root) {
452 TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&m_mutex);
453 mutex_unlock();
454 }
455 TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&new_root->m_mutex);
456 return new_root;
457 }
458
459
lock_and_rebalance_left(void)460 treenode *treenode::lock_and_rebalance_left(void) {
461 treenode *child = m_left_child.get_locked();
462 if (child) {
463 treenode *new_root = child->maybe_rebalance();
464 m_left_child.set(new_root);
465 child = new_root;
466 }
467 return child;
468 }
469
lock_and_rebalance_right(void)470 treenode *treenode::lock_and_rebalance_right(void) {
471 treenode *child = m_right_child.get_locked();
472 if (child) {
473 treenode *new_root = child->maybe_rebalance();
474 m_right_child.set(new_root);
475 child = new_root;
476 }
477 return child;
478 }
479
set(treenode * node)480 void treenode::child_ptr::set(treenode *node) {
481 ptr = node;
482 depth_est = ptr ? ptr->get_depth_estimate() : 0;
483 }
484
get_locked(void)485 treenode *treenode::child_ptr::get_locked(void) {
486 if (ptr) {
487 ptr->mutex_lock();
488 depth_est = ptr->get_depth_estimate();
489 }
490 return ptr;
491 }
492