1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3 #ident "$Id$"
4 /*======
5 This file is part of PerconaFT.
6
7
8 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9
10 PerconaFT is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License, version 2,
12 as published by the Free Software Foundation.
13
14 PerconaFT is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
21
22 ----------------------------------------
23
24 PerconaFT is free software: you can redistribute it and/or modify
25 it under the terms of the GNU Affero General Public License, version 3,
26 as published by the Free Software Foundation.
27
28 PerconaFT is distributed in the hope that it will be useful,
29 but WITHOUT ANY WARRANTY; without even the implied warranty of
30 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 GNU Affero General Public License for more details.
32
33 You should have received a copy of the GNU Affero General Public License
34 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
35 ======= */
36
37 #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
38
39 #include "ft/ft.h"
40 #include "ft/ft-cachetable-wrappers.h"
41 #include "ft/ft-flusher.h"
42 #include "ft/ft-flusher-internal.h"
43 #include "ft/ft-internal.h"
44 #include "ft/node.h"
45 #include "portability/toku_atomic.h"
46 #include "util/context.h"
47 #include "util/status.h"
48
49 // Member Descirption:
50 // 1. highest_pivot_key - this is the key that corresponds to the
51 // most recently flushed leaf entry.
52 // 2. max_current_key - this is the pivot/key that we inherit as
53 // we descend down the tree. We use this to set the highest_pivot_key.
54 // 3. sub_tree_size - this is the percentage of the entire tree that our
55 // current position (in a sub-tree) encompasses.
56 // 4. percentage_done - this is the percentage of leaf nodes that have
57 // been flushed into.
58 // 5. rightmost_leaf_seen - this is a boolean we use to determine if
59 // if we have flushed to every leaf node.
60 struct hot_flusher_extra {
61 DBT highest_pivot_key;
62 DBT max_current_key;
63 float sub_tree_size;
64 float percentage_done;
65 bool rightmost_leaf_seen;
66 };
67
68 void
toku_ft_hot_get_status(FT_HOT_STATUS s)69 toku_ft_hot_get_status(FT_HOT_STATUS s) {
70 hot_status.init();
71 *s = hot_status;
72 }
73
74 // Copies the max current key to the highest pivot key seen.
75 static void
hot_set_highest_key(struct hot_flusher_extra * flusher)76 hot_set_highest_key(struct hot_flusher_extra *flusher)
77 {
78 // The max current key will be NULL if we are traversing in the
79 // rightmost subtree of a given parent. As such, we don't want to
80 // allocate memory for this case.
81 toku_destroy_dbt(&flusher->highest_pivot_key);
82 if (flusher->max_current_key.data != NULL) {
83 // Otherwise, let's copy all the contents from one key to the other.
84 toku_clone_dbt(&flusher->highest_pivot_key, flusher->max_current_key);
85 }
86 }
87
88 static void
hot_set_start_key(struct hot_flusher_extra * flusher,const DBT * start)89 hot_set_start_key(struct hot_flusher_extra *flusher, const DBT* start)
90 {
91 toku_destroy_dbt(&flusher->highest_pivot_key);
92 if (start != NULL) {
93 // Otherwise, let's copy all the contents from one key to the other.
94 toku_clone_dbt(&flusher->highest_pivot_key, *start);
95 }
96 }
97
98 static int
hot_just_pick_child(FT ft,FTNODE parent,struct hot_flusher_extra * flusher)99 hot_just_pick_child(FT ft,
100 FTNODE parent,
101 struct hot_flusher_extra *flusher)
102 {
103 int childnum = 0;
104
105 // Search through Parents pivots, see which one is greater than
106 // the highest_pivot_key seen so far.
107 if (flusher->highest_pivot_key.data == NULL)
108 {
109 // Special case of the first child of the root node.
110 // Also known as, NEGATIVE INFINITY....
111 childnum = 0;
112 } else {
113 // Find the pivot boundary.
114 childnum = toku_ftnode_hot_next_child(parent, &flusher->highest_pivot_key, ft->cmp);
115 }
116
117 return childnum;
118 }
119
120 static void
hot_update_flusher_keys(FTNODE parent,int childnum,struct hot_flusher_extra * flusher)121 hot_update_flusher_keys(FTNODE parent,
122 int childnum,
123 struct hot_flusher_extra *flusher)
124 {
125 // Update maximum current key if the child is NOT the rightmost
126 // child node.
127 if (childnum < (parent->n_children - 1)) {
128 toku_destroy_dbt(&flusher->max_current_key);
129 toku_clone_dbt(&flusher->max_current_key, parent->pivotkeys.get_pivot(childnum));
130 }
131 }
132
133 // Picks which child toku_ft_flush_some_child will use for flushing and
134 // recursion.
135 static int
hot_pick_child(FT ft,FTNODE parent,void * extra)136 hot_pick_child(FT ft,
137 FTNODE parent,
138 void *extra)
139 {
140 struct hot_flusher_extra *flusher = (struct hot_flusher_extra *) extra;
141 int childnum = hot_just_pick_child(ft, parent, flusher);
142
143 // Now we determine the percentage of the tree flushed so far.
144
145 // Whichever subtree we choose to recurse into, it is a fraction
146 // of the current parent.
147 flusher->sub_tree_size /= parent->n_children;
148
149 // Update the precentage complete, using our new sub tree size AND
150 // the number of children we have already flushed.
151 flusher->percentage_done += (flusher->sub_tree_size * childnum);
152
153 hot_update_flusher_keys(parent, childnum, flusher);
154
155 return childnum;
156 }
157
158 // Does nothing for now.
159 static void
hot_update_status(FTNODE UU (child),int UU (dirtied),void * UU (extra))160 hot_update_status(FTNODE UU(child),
161 int UU(dirtied),
162 void *UU(extra))
163 {
164 return;
165 }
166
167 // If we've just split a node, HOT needs another chance to decide which
168 // one to flush into. This gives it a chance to do that, and update the
169 // keys it maintains.
170 static int
hot_pick_child_after_split(FT ft,FTNODE parent,int childnuma,int childnumb,void * extra)171 hot_pick_child_after_split(FT ft,
172 FTNODE parent,
173 int childnuma,
174 int childnumb,
175 void *extra)
176 {
177 struct hot_flusher_extra *flusher = (struct hot_flusher_extra *) extra;
178 int childnum = hot_just_pick_child(ft, parent, flusher);
179 assert(childnum == childnuma || childnum == childnumb);
180 hot_update_flusher_keys(parent, childnum, flusher);
181 if (parent->height == 1) {
182 // We don't want to recurse into a leaf node, but if we return
183 // anything valid, ft_split_child will try to go there, so we
184 // return -1 to allow ft_split_child to have its default
185 // behavior, which will be to stop recursing.
186 childnum = -1;
187 }
188 return childnum;
189 }
190
191 // Basic constructor/initializer for the hot flusher struct.
192 static void
hot_flusher_init(struct flusher_advice * advice,struct hot_flusher_extra * flusher)193 hot_flusher_init(struct flusher_advice *advice,
194 struct hot_flusher_extra *flusher)
195 {
196 // Initialize the highest pivot key seen to NULL. This represents
197 // NEGATIVE INFINITY and is used to cover the special case of our
198 // first traversal of the tree.
199 toku_init_dbt(&(flusher->highest_pivot_key));
200 toku_init_dbt(&(flusher->max_current_key));
201 flusher->rightmost_leaf_seen = 0;
202 flusher->sub_tree_size = 1.0;
203 flusher->percentage_done = 0.0;
204 flusher_advice_init(advice,
205 hot_pick_child,
206 dont_destroy_basement_nodes,
207 always_recursively_flush,
208 default_merge_child,
209 hot_update_status,
210 hot_pick_child_after_split,
211 flusher
212 );
213 }
214
215 // Erases any DBT keys we have copied from a traversal.
216 static void
hot_flusher_destroy(struct hot_flusher_extra * flusher)217 hot_flusher_destroy(struct hot_flusher_extra *flusher)
218 {
219 toku_destroy_dbt(&flusher->highest_pivot_key);
220 toku_destroy_dbt(&flusher->max_current_key);
221 }
222
223 // Entry point for Hot Optimize Table (HOT). Note, this function is
224 // not recursive. It iterates over root-to-leaf paths.
225 int
toku_ft_hot_optimize(FT_HANDLE ft_handle,DBT * left,DBT * right,int (* progress_callback)(void * extra,float progress),void * progress_extra,uint64_t * loops_run)226 toku_ft_hot_optimize(FT_HANDLE ft_handle, DBT* left, DBT* right,
227 int (*progress_callback)(void *extra, float progress),
228 void *progress_extra, uint64_t* loops_run)
229 {
230 toku::context flush_ctx(CTX_FLUSH);
231
232 int r = 0;
233 struct hot_flusher_extra flusher;
234 struct flusher_advice advice;
235
236 hot_flusher_init(&advice, &flusher);
237 hot_set_start_key(&flusher, left);
238
239 uint64_t loop_count = 0;
240 MSN msn_at_start_of_hot = ZERO_MSN; // capture msn from root at
241 // start of HOT operation
242 (void) toku_sync_fetch_and_add(&HOT_STATUS_VAL(FT_HOT_NUM_STARTED), 1);
243
244 toku_ft_note_hot_begin(ft_handle);
245
246 // Higher level logic prevents a dictionary from being deleted or
247 // truncated during a hot optimize operation. Doing so would violate
248 // the hot optimize contract.
249 do {
250 FTNODE root;
251 CACHEKEY root_key;
252 uint32_t fullhash;
253
254 {
255 // Get root node (the first parent of each successive HOT
256 // call.)
257 toku_calculate_root_offset_pointer(ft_handle->ft, &root_key, &fullhash);
258 ftnode_fetch_extra bfe;
259 bfe.create_for_full_read(ft_handle->ft);
260 toku_pin_ftnode(ft_handle->ft,
261 (BLOCKNUM) root_key,
262 fullhash,
263 &bfe,
264 PL_WRITE_EXPENSIVE,
265 &root,
266 true);
267 toku_ftnode_assert_fully_in_memory(root);
268 }
269
270 // Prepare HOT diagnostics.
271 if (loop_count == 0) {
272 // The first time through, capture msn from root
273 msn_at_start_of_hot = root->max_msn_applied_to_node_on_disk;
274 }
275
276 loop_count++;
277
278 if (loop_count > HOT_STATUS_VAL(FT_HOT_MAX_ROOT_FLUSH_COUNT)) {
279 HOT_STATUS_VAL(FT_HOT_MAX_ROOT_FLUSH_COUNT) = loop_count;
280 }
281
282 // Initialize the maximum current key. We need to do this for
283 // every traversal.
284 toku_destroy_dbt(&flusher.max_current_key);
285
286 flusher.sub_tree_size = 1.0;
287 flusher.percentage_done = 0.0;
288
289 // This should recurse to the bottom of the tree and then
290 // return.
291 if (root->height > 0) {
292 toku_ft_flush_some_child(ft_handle->ft, root, &advice);
293 } else {
294 // Since there are no children to flush, we should abort
295 // the HOT call.
296 flusher.rightmost_leaf_seen = 1;
297 toku_unpin_ftnode(ft_handle->ft, root);
298 }
299
300 // Set the highest pivot key seen here, since the parent may
301 // be unlocked and NULL'd later in our caller:
302 // toku_ft_flush_some_child().
303 hot_set_highest_key(&flusher);
304
305 // This is where we determine if the traversal is finished or
306 // not.
307 if (flusher.max_current_key.data == NULL) {
308 flusher.rightmost_leaf_seen = 1;
309 }
310 else if (right) {
311 // if we have flushed past the bounds set for us,
312 // set rightmost_leaf_seen so we exit
313 int cmp = ft_handle->ft->cmp(&flusher.max_current_key, right);
314 if (cmp > 0) {
315 flusher.rightmost_leaf_seen = 1;
316 }
317 }
318
319 // Update HOT's progress.
320 if (progress_callback != NULL) {
321 r = progress_callback(progress_extra, flusher.percentage_done);
322
323 // Check if the callback wants us to stop running HOT.
324 if (r != 0) {
325 flusher.rightmost_leaf_seen = 1;
326 }
327 }
328
329 // Loop until the max key has been updated to positive
330 // infinity.
331 } while (!flusher.rightmost_leaf_seen);
332 *loops_run = loop_count;
333
334 // Cleanup.
335 hot_flusher_destroy(&flusher);
336
337 // More diagnostics.
338 {
339 bool success = false;
340 if (r == 0) { success = true; }
341
342 {
343 toku_ft_note_hot_complete(ft_handle, success, msn_at_start_of_hot);
344 }
345
346 if (success) {
347 (void) toku_sync_fetch_and_add(&HOT_STATUS_VAL(FT_HOT_NUM_COMPLETED), 1);
348 } else {
349 (void) toku_sync_fetch_and_add(&HOT_STATUS_VAL(FT_HOT_NUM_ABORTED), 1);
350 }
351 }
352 return r;
353 }
354
355 #include <toku_race_tools.h>
356 void __attribute__((__constructor__)) toku_hot_helgrind_ignore(void);
357 void
toku_hot_helgrind_ignore(void)358 toku_hot_helgrind_ignore(void) {
359 // incremented only while lock is held, but read by engine status asynchronously.
360 TOKU_VALGRIND_HG_DISABLE_CHECKING(&hot_status, sizeof hot_status);
361 }
362