1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3 #ident "$Id$"
4 /*======
5 This file is part of PerconaFT.
6 
7 
8 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9 
10     PerconaFT is free software: you can redistribute it and/or modify
11     it under the terms of the GNU General Public License, version 2,
12     as published by the Free Software Foundation.
13 
14     PerconaFT is distributed in the hope that it will be useful,
15     but WITHOUT ANY WARRANTY; without even the implied warranty of
16     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17     GNU General Public License for more details.
18 
19     You should have received a copy of the GNU General Public License
20     along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
21 
22 ----------------------------------------
23 
24     PerconaFT is free software: you can redistribute it and/or modify
25     it under the terms of the GNU Affero General Public License, version 3,
26     as published by the Free Software Foundation.
27 
28     PerconaFT is distributed in the hope that it will be useful,
29     but WITHOUT ANY WARRANTY; without even the implied warranty of
30     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
31     GNU Affero General Public License for more details.
32 
33     You should have received a copy of the GNU Affero General Public License
34     along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
35 
36 ----------------------------------------
37 
38    Licensed under the Apache License, Version 2.0 (the "License");
39    you may not use this file except in compliance with the License.
40    You may obtain a copy of the License at
41 
42        http://www.apache.org/licenses/LICENSE-2.0
43 
44    Unless required by applicable law or agreed to in writing, software
45    distributed under the License is distributed on an "AS IS" BASIS,
46    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
47    See the License for the specific language governing permissions and
48    limitations under the License.
49 ======= */
50 
51 #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
52 
53 #include <toku_race_tools.h>
54 
55 // TODO: source location info might have to be pulled up one caller
56 // to be useful
mutex_lock(void)57 void treenode::mutex_lock(void) { toku_mutex_lock(&m_mutex); }
58 
mutex_unlock(void)59 void treenode::mutex_unlock(void) {
60     toku_mutex_unlock(&m_mutex);
61 }
62 
init(const comparator * cmp)63 void treenode::init(const comparator *cmp) {
64     m_txnid = TXNID_NONE;
65     m_is_root = false;
66     m_is_empty = true;
67     m_cmp = cmp;
68     // use an adaptive mutex at each node since we expect the time the
69     // lock is held to be relatively short compared to a context switch.
70     // indeed, this improves performance at high thread counts considerably.
71     memset(&m_mutex, 0, sizeof(toku_mutex_t));
72     toku_pthread_mutexattr_t attr;
73     toku_mutexattr_init(&attr);
74     toku_mutexattr_settype(&attr, TOKU_MUTEX_ADAPTIVE);
75     toku_mutex_init(*treenode_mutex_key, &m_mutex, &attr);
76     toku_mutexattr_destroy(&attr);
77     m_left_child.set(nullptr);
78     m_right_child.set(nullptr);
79 }
80 
create_root(const comparator * cmp)81 void treenode::create_root(const comparator *cmp) {
82     init(cmp);
83     m_is_root = true;
84 }
85 
destroy_root(void)86 void treenode::destroy_root(void) {
87     invariant(is_root());
88     invariant(is_empty());
89     toku_mutex_destroy(&m_mutex);
90     m_cmp = nullptr;
91 }
92 
set_range_and_txnid(const keyrange & range,TXNID txnid)93 void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid) {
94     // allocates a new copy of the range for this node
95     m_range.create_copy(range);
96     m_txnid = txnid;
97     m_is_empty = false;
98 }
99 
is_root(void)100 bool treenode::is_root(void) {
101     return m_is_root;
102 }
103 
is_empty(void)104 bool treenode::is_empty(void) {
105     return m_is_empty;
106 }
107 
range_overlaps(const keyrange & range)108 bool treenode::range_overlaps(const keyrange &range) {
109     return m_range.overlaps(*m_cmp, range);
110 }
111 
alloc(const comparator * cmp,const keyrange & range,TXNID txnid)112 treenode *treenode::alloc(const comparator *cmp, const keyrange &range, TXNID txnid) {
113     treenode *XCALLOC(node);
114     node->init(cmp);
115     node->set_range_and_txnid(range, txnid);
116     return node;
117 }
118 
swap_in_place(treenode * node1,treenode * node2)119 void treenode::swap_in_place(treenode *node1, treenode *node2) {
120     keyrange tmp_range = node1->m_range;
121     TXNID tmp_txnid = node1->m_txnid;
122     node1->m_range = node2->m_range;
123     node1->m_txnid = node2->m_txnid;
124     node2->m_range = tmp_range;
125     node2->m_txnid = tmp_txnid;
126 }
127 
free(treenode * node)128 void treenode::free(treenode *node) {
129     // destroy the range, freeing any copied keys
130     node->m_range.destroy();
131 
132     // the root is simply marked as empty.
133     if (node->is_root()) {
134         toku_mutex_assert_locked(&node->m_mutex);
135         node->m_is_empty = true;
136     } else {
137         toku_mutex_assert_unlocked(&node->m_mutex);
138         toku_mutex_destroy(&node->m_mutex);
139         toku_free(node);
140     }
141 }
142 
get_depth_estimate(void) const143 uint32_t treenode::get_depth_estimate(void) const {
144     const uint32_t left_est = m_left_child.depth_est;
145     const uint32_t right_est = m_right_child.depth_est;
146     return (left_est > right_est ? left_est : right_est) + 1;
147 }
148 
find_node_with_overlapping_child(const keyrange & range,const keyrange::comparison * cmp_hint)149 treenode *treenode::find_node_with_overlapping_child(const keyrange &range,
150         const keyrange::comparison *cmp_hint) {
151 
152     // determine which child to look at based on a comparison. if we were
153     // given a comparison hint, use that. otherwise, compare them now.
154     keyrange::comparison c = cmp_hint ? *cmp_hint : range.compare(*m_cmp, m_range);
155 
156     treenode *child;
157     if (c == keyrange::comparison::LESS_THAN) {
158         child = lock_and_rebalance_left();
159     } else {
160         // The caller (locked_keyrange::acquire) handles the case where
161         // the root of the locked_keyrange is the node that overlaps.
162         // range is guaranteed not to overlap this node.
163         invariant(c == keyrange::comparison::GREATER_THAN);
164         child = lock_and_rebalance_right();
165     }
166 
167     // if the search would lead us to an empty subtree (child == nullptr),
168     // or the child overlaps, then we know this node is the parent we want.
169     // otherwise we need to recur into that child.
170     if (child == nullptr) {
171         return this;
172     } else {
173         c = range.compare(*m_cmp, child->m_range);
174         if (c == keyrange::comparison::EQUALS || c == keyrange::comparison::OVERLAPS) {
175             child->mutex_unlock();
176             return this;
177         } else {
178             // unlock this node before recurring into the locked child,
179             // passing in a comparison hint since we just comapred range
180             // to the child's range.
181             mutex_unlock();
182             return child->find_node_with_overlapping_child(range, &c);
183         }
184     }
185 }
186 
187 template <class F>
traverse_overlaps(const keyrange & range,F * function)188 void treenode::traverse_overlaps(const keyrange &range, F *function) {
189     keyrange::comparison c = range.compare(*m_cmp, m_range);
190     if (c == keyrange::comparison::EQUALS) {
191         // Doesn't matter if fn wants to keep going, there
192         // is nothing left, so return.
193         function->fn(m_range, m_txnid);
194         return;
195     }
196 
197     treenode *left = m_left_child.get_locked();
198     if (left) {
199         if (c != keyrange::comparison::GREATER_THAN) {
200             // Target range is less than this node, or it overlaps this
201             // node.  There may be something on the left.
202             left->traverse_overlaps(range, function);
203         }
204         left->mutex_unlock();
205     }
206 
207     if (c == keyrange::comparison::OVERLAPS) {
208         bool keep_going = function->fn(m_range, m_txnid);
209         if (!keep_going) {
210             return;
211         }
212     }
213 
214     treenode *right = m_right_child.get_locked();
215     if (right) {
216         if (c != keyrange::comparison::LESS_THAN) {
217             // Target range is greater than this node, or it overlaps this
218             // node.  There may be something on the right.
219             right->traverse_overlaps(range, function);
220         }
221         right->mutex_unlock();
222     }
223 }
224 
insert(const keyrange & range,TXNID txnid)225 void treenode::insert(const keyrange &range, TXNID txnid) {
226     // choose a child to check. if that child is null, then insert the new node there.
227     // otherwise recur down that child's subtree
228     keyrange::comparison c = range.compare(*m_cmp, m_range);
229     if (c == keyrange::comparison::LESS_THAN) {
230         treenode *left_child = lock_and_rebalance_left();
231         if (left_child == nullptr) {
232             left_child = treenode::alloc(m_cmp, range, txnid);
233             m_left_child.set(left_child);
234         } else {
235             left_child->insert(range, txnid);
236             left_child->mutex_unlock();
237         }
238     } else {
239         invariant(c == keyrange::comparison::GREATER_THAN);
240         treenode *right_child = lock_and_rebalance_right();
241         if (right_child == nullptr) {
242             right_child = treenode::alloc(m_cmp, range, txnid);
243             m_right_child.set(right_child);
244         } else {
245             right_child->insert(range, txnid);
246             right_child->mutex_unlock();
247         }
248     }
249 }
250 
find_child_at_extreme(int direction,treenode ** parent)251 treenode *treenode::find_child_at_extreme(int direction, treenode **parent) {
252     treenode *child = direction > 0 ?
253         m_right_child.get_locked() : m_left_child.get_locked();
254 
255     if (child) {
256         *parent = this;
257         treenode *child_extreme = child->find_child_at_extreme(direction, parent);
258         child->mutex_unlock();
259         return child_extreme;
260     } else {
261         return this;
262     }
263 }
264 
find_leftmost_child(treenode ** parent)265 treenode *treenode::find_leftmost_child(treenode **parent) {
266     return find_child_at_extreme(-1, parent);
267 }
268 
find_rightmost_child(treenode ** parent)269 treenode *treenode::find_rightmost_child(treenode **parent) {
270     return find_child_at_extreme(1, parent);
271 }
272 
remove_root_of_subtree()273 treenode *treenode::remove_root_of_subtree() {
274     // if this node has no children, just free it and return null
275     if (m_left_child.ptr == nullptr && m_right_child.ptr == nullptr) {
276         // treenode::free requires that non-root nodes are unlocked
277         if (!is_root()) {
278             mutex_unlock();
279         }
280         treenode::free(this);
281         return nullptr;
282     }
283 
284     // we have a child, so get either the in-order successor or
285     // predecessor of this node to be our replacement.
286     // replacement_parent is updated by the find functions as
287     // they recur down the tree, so initialize it to this.
288     treenode *child, *replacement;
289     treenode *replacement_parent = this;
290     if (m_left_child.ptr != nullptr) {
291         child = m_left_child.get_locked();
292         replacement = child->find_rightmost_child(&replacement_parent);
293         invariant(replacement == child || replacement_parent != this);
294 
295         // detach the replacement from its parent
296         if (replacement_parent == this) {
297             m_left_child = replacement->m_left_child;
298         } else {
299             replacement_parent->m_right_child = replacement->m_left_child;
300         }
301     } else {
302         child = m_right_child.get_locked();
303         replacement = child->find_leftmost_child(&replacement_parent);
304         invariant(replacement == child || replacement_parent != this);
305 
306         // detach the replacement from its parent
307         if (replacement_parent == this) {
308             m_right_child = replacement->m_right_child;
309         } else {
310             replacement_parent->m_left_child = replacement->m_right_child;
311         }
312     }
313     child->mutex_unlock();
314 
315     // swap in place with the detached replacement, then destroy it
316     treenode::swap_in_place(replacement, this);
317     treenode::free(replacement);
318 
319     return this;
320 }
321 
recursive_remove(void)322 void treenode::recursive_remove(void) {
323     treenode *left = m_left_child.ptr;
324     if (left) {
325         left->recursive_remove();
326     }
327     m_left_child.set(nullptr);
328 
329     treenode *right = m_right_child.ptr;
330     if (right) {
331         right->recursive_remove();
332     }
333     m_right_child.set(nullptr);
334 
335     // we do not take locks on the way down, so we know non-root nodes
336     // are unlocked here and the caller is required to pass a locked
337     // root, so this free is correct.
338     treenode::free(this);
339 }
340 
remove(const keyrange & range)341 treenode *treenode::remove(const keyrange &range) {
342     treenode *child;
343     // if the range is equal to this node's range, then just remove
344     // the root of this subtree. otherwise search down the tree
345     // in either the left or right children.
346     keyrange::comparison c = range.compare(*m_cmp, m_range);
347     switch (c) {
348     case keyrange::comparison::EQUALS:
349         return remove_root_of_subtree();
350     case keyrange::comparison::LESS_THAN:
351         child = m_left_child.get_locked();
352         invariant_notnull(child);
353         child = child->remove(range);
354 
355         // unlock the child if there still is one.
356         // regardless, set the right child pointer
357         if (child) {
358             child->mutex_unlock();
359         }
360         m_left_child.set(child);
361         break;
362     case keyrange::comparison::GREATER_THAN:
363         child = m_right_child.get_locked();
364         invariant_notnull(child);
365         child = child->remove(range);
366 
367         // unlock the child if there still is one.
368         // regardless, set the right child pointer
369         if (child) {
370             child->mutex_unlock();
371         }
372         m_right_child.set(child);
373         break;
374     case keyrange::comparison::OVERLAPS:
375         // shouldn't be overlapping, since the tree is
376         // non-overlapping and this range must exist
377         abort();
378     }
379 
380     return this;
381 }
382 
left_imbalanced(int threshold) const383 bool treenode::left_imbalanced(int threshold) const {
384     uint32_t left_depth = m_left_child.depth_est;
385     uint32_t right_depth = m_right_child.depth_est;
386     return m_left_child.ptr != nullptr && left_depth > threshold + right_depth;
387 }
388 
right_imbalanced(int threshold) const389 bool treenode::right_imbalanced(int threshold) const {
390     uint32_t left_depth = m_left_child.depth_est;
391     uint32_t right_depth = m_right_child.depth_est;
392     return m_right_child.ptr != nullptr && right_depth > threshold + left_depth;
393 }
394 
395 // effect: rebalances the subtree rooted at this node
396 //         using AVL style O(1) rotations. unlocks this
397 //         node if it is not the new root of the subtree.
398 // requires: node is locked by this thread, children are not
399 // returns: locked root node of the rebalanced tree
maybe_rebalance(void)400 treenode *treenode::maybe_rebalance(void) {
401     // if we end up not rotating at all, the new root is this
402     treenode *new_root = this;
403     treenode *child = nullptr;
404 
405     if (left_imbalanced(IMBALANCE_THRESHOLD)) {
406         child = m_left_child.get_locked();
407         if (child->right_imbalanced(0)) {
408             treenode *grandchild = child->m_right_child.get_locked();
409 
410             child->m_right_child = grandchild->m_left_child;
411             grandchild->m_left_child.set(child);
412 
413             m_left_child = grandchild->m_right_child;
414             grandchild->m_right_child.set(this);
415 
416             new_root = grandchild;
417         } else {
418             m_left_child = child->m_right_child;
419             child->m_right_child.set(this);
420             new_root = child;
421         }
422     } else if (right_imbalanced(IMBALANCE_THRESHOLD)) {
423         child = m_right_child.get_locked();
424         if (child->left_imbalanced(0)) {
425             treenode *grandchild = child->m_left_child.get_locked();
426 
427             child->m_left_child = grandchild->m_right_child;
428             grandchild->m_right_child.set(child);
429 
430             m_right_child = grandchild->m_left_child;
431             grandchild->m_left_child.set(this);
432 
433             new_root = grandchild;
434         } else {
435             m_right_child = child->m_left_child;
436             child->m_left_child.set(this);
437             new_root = child;
438         }
439     }
440 
441     // up to three nodes may be locked.
442     // - this
443     // - child
444     // - grandchild (but if it is locked, its the new root)
445     //
446     // one of them is the new root. we unlock everything except the new root.
447     if (child && child != new_root) {
448         TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&child->m_mutex);
449         child->mutex_unlock();
450     }
451     if (this != new_root) {
452         TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&m_mutex);
453         mutex_unlock();
454     }
455     TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&new_root->m_mutex);
456     return new_root;
457 }
458 
459 
lock_and_rebalance_left(void)460 treenode *treenode::lock_and_rebalance_left(void) {
461     treenode *child = m_left_child.get_locked();
462     if (child) {
463         treenode *new_root = child->maybe_rebalance();
464         m_left_child.set(new_root);
465         child = new_root;
466     }
467     return child;
468 }
469 
lock_and_rebalance_right(void)470 treenode *treenode::lock_and_rebalance_right(void) {
471     treenode *child = m_right_child.get_locked();
472     if (child) {
473         treenode *new_root = child->maybe_rebalance();
474         m_right_child.set(new_root);
475         child = new_root;
476     }
477     return child;
478 }
479 
set(treenode * node)480 void treenode::child_ptr::set(treenode *node) {
481     ptr = node;
482     depth_est = ptr ? ptr->get_depth_estimate() : 0;
483 }
484 
get_locked(void)485 treenode *treenode::child_ptr::get_locked(void) {
486     if (ptr) {
487         ptr->mutex_lock();
488         depth_est = ptr->get_depth_estimate();
489     }
490     return ptr;
491 }
492