1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3 #ident "$Id$"
4 /*======
5 This file is part of PerconaFT.
6 
7 
8 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9 
10     PerconaFT is free software: you can redistribute it and/or modify
11     it under the terms of the GNU General Public License, version 2,
12     as published by the Free Software Foundation.
13 
14     PerconaFT is distributed in the hope that it will be useful,
15     but WITHOUT ANY WARRANTY; without even the implied warranty of
16     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17     GNU General Public License for more details.
18 
19     You should have received a copy of the GNU General Public License
20     along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
21 
22 ----------------------------------------
23 
24     PerconaFT is free software: you can redistribute it and/or modify
25     it under the terms of the GNU Affero General Public License, version 3,
26     as published by the Free Software Foundation.
27 
28     PerconaFT is distributed in the hope that it will be useful,
29     but WITHOUT ANY WARRANTY; without even the implied warranty of
30     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
31     GNU Affero General Public License for more details.
32 
33     You should have received a copy of the GNU Affero General Public License
34     along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
35 ======= */
36 
37 #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
38 
39 #include "ft/ft.h"
40 #include "ft/ft-cachetable-wrappers.h"
41 #include "ft/ft-flusher.h"
42 #include "ft/ft-flusher-internal.h"
43 #include "ft/ft-internal.h"
44 #include "ft/node.h"
45 #include "portability/toku_atomic.h"
46 #include "util/context.h"
47 #include "util/status.h"
48 
49 // Member Descirption:
50 // 1. highest_pivot_key - this is the key that corresponds to the
51 // most recently flushed leaf entry.
52 // 2. max_current_key - this is the pivot/key that we inherit as
53 // we descend down the tree.  We use this to set the highest_pivot_key.
54 // 3. sub_tree_size - this is the percentage of the entire tree that our
55 // current position (in a sub-tree) encompasses.
56 // 4. percentage_done - this is the percentage of leaf nodes that have
57 // been flushed into.
58 // 5. rightmost_leaf_seen - this is a boolean we use to determine if
59 // if we have flushed to every leaf node.
60 struct hot_flusher_extra {
61     DBT highest_pivot_key;
62     DBT max_current_key;
63     float sub_tree_size;
64     float percentage_done;
65     bool rightmost_leaf_seen;
66 };
67 
68 void
toku_ft_hot_get_status(FT_HOT_STATUS s)69 toku_ft_hot_get_status(FT_HOT_STATUS s) {
70     hot_status.init();
71     *s = hot_status;
72 }
73 
74 // Copies the max current key to the highest pivot key seen.
75 static void
hot_set_highest_key(struct hot_flusher_extra * flusher)76 hot_set_highest_key(struct hot_flusher_extra *flusher)
77 {
78     // The max current key will be NULL if we are traversing in the
79     // rightmost subtree of a given parent.  As such, we don't want to
80     // allocate memory for this case.
81     toku_destroy_dbt(&flusher->highest_pivot_key);
82     if (flusher->max_current_key.data != NULL) {
83         // Otherwise, let's copy all the contents from one key to the other.
84         toku_clone_dbt(&flusher->highest_pivot_key, flusher->max_current_key);
85     }
86 }
87 
88 static void
hot_set_start_key(struct hot_flusher_extra * flusher,const DBT * start)89 hot_set_start_key(struct hot_flusher_extra *flusher, const DBT* start)
90 {
91     toku_destroy_dbt(&flusher->highest_pivot_key);
92     if (start != NULL) {
93         // Otherwise, let's copy all the contents from one key to the other.
94         toku_clone_dbt(&flusher->highest_pivot_key, *start);
95     }
96 }
97 
98 static int
hot_just_pick_child(FT ft,FTNODE parent,struct hot_flusher_extra * flusher)99 hot_just_pick_child(FT ft,
100                     FTNODE parent,
101                     struct hot_flusher_extra *flusher)
102 {
103     int childnum = 0;
104 
105     // Search through Parents pivots, see which one is greater than
106     // the highest_pivot_key seen so far.
107     if (flusher->highest_pivot_key.data == NULL)
108     {
109         // Special case of the first child of the root node.
110         // Also known as, NEGATIVE INFINITY....
111         childnum = 0;
112     } else {
113         // Find the pivot boundary.
114         childnum = toku_ftnode_hot_next_child(parent, &flusher->highest_pivot_key, ft->cmp);
115     }
116 
117     return childnum;
118 }
119 
120 static void
hot_update_flusher_keys(FTNODE parent,int childnum,struct hot_flusher_extra * flusher)121 hot_update_flusher_keys(FTNODE parent,
122                         int childnum,
123                         struct hot_flusher_extra *flusher)
124 {
125     // Update maximum current key if the child is NOT the rightmost
126     // child node.
127     if (childnum < (parent->n_children - 1)) {
128         toku_destroy_dbt(&flusher->max_current_key);
129         toku_clone_dbt(&flusher->max_current_key, parent->pivotkeys.get_pivot(childnum));
130     }
131 }
132 
133 // Picks which child toku_ft_flush_some_child will use for flushing and
134 // recursion.
135 static int
hot_pick_child(FT ft,FTNODE parent,void * extra)136 hot_pick_child(FT ft,
137                FTNODE parent,
138                void *extra)
139 {
140     struct hot_flusher_extra *flusher = (struct hot_flusher_extra *) extra;
141     int childnum = hot_just_pick_child(ft, parent, flusher);
142 
143     // Now we determine the percentage of the tree flushed so far.
144 
145     // Whichever subtree we choose to recurse into, it is a fraction
146     // of the current parent.
147     flusher->sub_tree_size /= parent->n_children;
148 
149     // Update the precentage complete, using our new sub tree size AND
150     // the number of children we have already flushed.
151     flusher->percentage_done += (flusher->sub_tree_size * childnum);
152 
153     hot_update_flusher_keys(parent, childnum, flusher);
154 
155     return childnum;
156 }
157 
158 // Does nothing for now.
159 static void
hot_update_status(FTNODE UU (child),int UU (dirtied),void * UU (extra))160 hot_update_status(FTNODE UU(child),
161                   int UU(dirtied),
162                   void *UU(extra))
163 {
164     return;
165 }
166 
167 // If we've just split a node, HOT needs another chance to decide which
168 // one to flush into.  This gives it a chance to do that, and update the
169 // keys it maintains.
170 static int
hot_pick_child_after_split(FT ft,FTNODE parent,int childnuma,int childnumb,void * extra)171 hot_pick_child_after_split(FT ft,
172                            FTNODE parent,
173                            int childnuma,
174                            int childnumb,
175                            void *extra)
176 {
177     struct hot_flusher_extra *flusher = (struct hot_flusher_extra *) extra;
178     int childnum = hot_just_pick_child(ft, parent, flusher);
179     assert(childnum == childnuma || childnum == childnumb);
180     hot_update_flusher_keys(parent, childnum, flusher);
181     if (parent->height == 1) {
182         // We don't want to recurse into a leaf node, but if we return
183         // anything valid, ft_split_child will try to go there, so we
184         // return -1 to allow ft_split_child to have its default
185         // behavior, which will be to stop recursing.
186         childnum = -1;
187     }
188     return childnum;
189 }
190 
191 // Basic constructor/initializer for the hot flusher struct.
192 static void
hot_flusher_init(struct flusher_advice * advice,struct hot_flusher_extra * flusher)193 hot_flusher_init(struct flusher_advice *advice,
194                  struct hot_flusher_extra *flusher)
195 {
196     // Initialize the highest pivot key seen to NULL.  This represents
197     // NEGATIVE INFINITY and is used to cover the special case of our
198     // first traversal of the tree.
199     toku_init_dbt(&(flusher->highest_pivot_key));
200     toku_init_dbt(&(flusher->max_current_key));
201     flusher->rightmost_leaf_seen = 0;
202     flusher->sub_tree_size = 1.0;
203     flusher->percentage_done = 0.0;
204     flusher_advice_init(advice,
205                         hot_pick_child,
206                         dont_destroy_basement_nodes,
207                         always_recursively_flush,
208                         default_merge_child,
209                         hot_update_status,
210                         hot_pick_child_after_split,
211                         flusher
212                         );
213 }
214 
215 // Erases any DBT keys we have copied from a traversal.
216 static void
hot_flusher_destroy(struct hot_flusher_extra * flusher)217 hot_flusher_destroy(struct hot_flusher_extra *flusher)
218 {
219     toku_destroy_dbt(&flusher->highest_pivot_key);
220     toku_destroy_dbt(&flusher->max_current_key);
221 }
222 
223 // Entry point for Hot Optimize Table (HOT).  Note, this function is
224 // not recursive.  It iterates over root-to-leaf paths.
225 int
toku_ft_hot_optimize(FT_HANDLE ft_handle,DBT * left,DBT * right,int (* progress_callback)(void * extra,float progress),void * progress_extra,uint64_t * loops_run)226 toku_ft_hot_optimize(FT_HANDLE ft_handle, DBT* left, DBT* right,
227                      int (*progress_callback)(void *extra, float progress),
228                      void *progress_extra, uint64_t* loops_run)
229 {
230     toku::context flush_ctx(CTX_FLUSH);
231 
232     int r = 0;
233     struct hot_flusher_extra flusher;
234     struct flusher_advice advice;
235 
236     hot_flusher_init(&advice, &flusher);
237     hot_set_start_key(&flusher, left);
238 
239     uint64_t loop_count = 0;
240     MSN msn_at_start_of_hot = ZERO_MSN;  // capture msn from root at
241                                          // start of HOT operation
242     (void) toku_sync_fetch_and_add(&HOT_STATUS_VAL(FT_HOT_NUM_STARTED), 1);
243 
244     toku_ft_note_hot_begin(ft_handle);
245 
246     // Higher level logic prevents a dictionary from being deleted or
247     // truncated during a hot optimize operation.  Doing so would violate
248     // the hot optimize contract.
249     do {
250         FTNODE root;
251         CACHEKEY root_key;
252         uint32_t fullhash;
253 
254         {
255             // Get root node (the first parent of each successive HOT
256             // call.)
257             toku_calculate_root_offset_pointer(ft_handle->ft, &root_key, &fullhash);
258             ftnode_fetch_extra bfe;
259             bfe.create_for_full_read(ft_handle->ft);
260             toku_pin_ftnode(ft_handle->ft,
261                             (BLOCKNUM) root_key,
262                             fullhash,
263                             &bfe,
264                             PL_WRITE_EXPENSIVE,
265                             &root,
266                             true);
267             toku_ftnode_assert_fully_in_memory(root);
268         }
269 
270         // Prepare HOT diagnostics.
271         if (loop_count == 0) {
272             // The first time through, capture msn from root
273             msn_at_start_of_hot = root->max_msn_applied_to_node_on_disk;
274         }
275 
276         loop_count++;
277 
278         if (loop_count > HOT_STATUS_VAL(FT_HOT_MAX_ROOT_FLUSH_COUNT)) {
279             HOT_STATUS_VAL(FT_HOT_MAX_ROOT_FLUSH_COUNT) = loop_count;
280         }
281 
282         // Initialize the maximum current key.  We need to do this for
283         // every traversal.
284         toku_destroy_dbt(&flusher.max_current_key);
285 
286         flusher.sub_tree_size = 1.0;
287         flusher.percentage_done = 0.0;
288 
289         // This should recurse to the bottom of the tree and then
290         // return.
291         if (root->height > 0) {
292             toku_ft_flush_some_child(ft_handle->ft, root, &advice);
293         } else {
294             // Since there are no children to flush, we should abort
295             // the HOT call.
296             flusher.rightmost_leaf_seen = 1;
297             toku_unpin_ftnode(ft_handle->ft, root);
298         }
299 
300         // Set the highest pivot key seen here, since the parent may
301         // be unlocked and NULL'd later in our caller:
302         // toku_ft_flush_some_child().
303         hot_set_highest_key(&flusher);
304 
305         // This is where we determine if the traversal is finished or
306         // not.
307         if (flusher.max_current_key.data == NULL) {
308             flusher.rightmost_leaf_seen = 1;
309         }
310         else if (right) {
311             // if we have flushed past the bounds set for us,
312             // set rightmost_leaf_seen so we exit
313             int cmp = ft_handle->ft->cmp(&flusher.max_current_key, right);
314             if (cmp > 0) {
315                 flusher.rightmost_leaf_seen = 1;
316             }
317         }
318 
319         // Update HOT's progress.
320         if (progress_callback != NULL) {
321             r = progress_callback(progress_extra, flusher.percentage_done);
322 
323             // Check if the callback wants us to stop running HOT.
324             if (r != 0) {
325                 flusher.rightmost_leaf_seen = 1;
326             }
327         }
328 
329         // Loop until the max key has been updated to positive
330         // infinity.
331     } while (!flusher.rightmost_leaf_seen);
332     *loops_run = loop_count;
333 
334     // Cleanup.
335     hot_flusher_destroy(&flusher);
336 
337     // More diagnostics.
338     {
339         bool success = false;
340         if (r == 0) { success = true; }
341 
342         {
343             toku_ft_note_hot_complete(ft_handle, success, msn_at_start_of_hot);
344         }
345 
346         if (success) {
347             (void) toku_sync_fetch_and_add(&HOT_STATUS_VAL(FT_HOT_NUM_COMPLETED), 1);
348         } else {
349             (void) toku_sync_fetch_and_add(&HOT_STATUS_VAL(FT_HOT_NUM_ABORTED), 1);
350         }
351     }
352     return r;
353 }
354 
355 #include <toku_race_tools.h>
356 void __attribute__((__constructor__)) toku_hot_helgrind_ignore(void);
357 void
toku_hot_helgrind_ignore(void)358 toku_hot_helgrind_ignore(void) {
359     // incremented only while lock is held, but read by engine status asynchronously.
360     TOKU_VALGRIND_HG_DISABLE_CHECKING(&hot_status, sizeof hot_status);
361 }
362