1 /*****************************************************************************
2 
3 Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2017, 2020, MariaDB Corporation.
5 
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free Software
8 Foundation; version 2 of the License.
9 
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17 
18 *****************************************************************************/
19 
20 /**************************************************//**
21 @file ut/ut0stage.h
22 Supplementary code to performance schema stage instrumentation.
23 
24 Created Nov 12, 2014 Vasil Dimov
25 *******************************************************/
26 
27 #ifndef ut0stage_h
28 #define ut0stage_h
29 
30 #include <algorithm>
31 #include <math.h>
32 
33 #include "my_global.h" /* needed for headers from mysql/psi/ */
34 
35 #include "mysql/psi/mysql_stage.h" /* mysql_stage_inc_work_completed */
36 #include "mysql/psi/psi.h" /* HAVE_PSI_STAGE_INTERFACE, PSI_stage_progress */
37 
38 #include "dict0mem.h" /* dict_index_t */
39 #include "row0log.h" /* row_log_estimate_work() */
40 #include "srv0srv.h" /* ut_stage_alter_t */
41 
42 #ifdef HAVE_PSI_STAGE_INTERFACE
43 
44 /** Class used to report ALTER TABLE progress via performance_schema.
45 The only user of this class is the ALTER TABLE code and it calls the methods
46 in the following order
47 constructor
48 begin_phase_read_pk()
49   multiple times:
50     n_pk_recs_inc() // once per record read
51     inc() // once per page read
52 end_phase_read_pk()
53 if any new indexes are being added, for each one:
54   begin_phase_sort()
55     multiple times:
56       inc() // once per record sorted
57   begin_phase_insert()
58     multiple times:
59       inc() // once per record inserted
60   being_phase_log_index()
61     multiple times:
62       inc() // once per log-block applied
63 begin_phase_log_table()
64     multiple times:
65       inc() // once per log-block applied
66 begin_phase_end()
67 destructor
68 
69 This class knows the specifics of each phase and tries to increment the
70 progress in an even manner across the entire ALTER TABLE lifetime. */
71 class ut_stage_alter_t {
72 public:
73 	/** Constructor.
74 	@param[in]	pk	primary key of the old table */
75 	explicit
ut_stage_alter_t(const dict_index_t * pk)76 	ut_stage_alter_t(
77 		const dict_index_t*	pk)
78 		:
79 		m_progress(NULL),
80 		m_pk(pk),
81 		m_n_pk_recs(0),
82 		m_n_pk_pages(0),
83 		m_n_recs_processed(0),
84 		m_cur_phase(NOT_STARTED)
85 	{
86 	}
87 
88 	/** Destructor. */
89 	~ut_stage_alter_t();
90 
91 	/** Flag an ALTER TABLE start (read primary key phase).
92 	@param[in]	n_sort_indexes	number of indexes that will be sorted
93 	during ALTER TABLE, used for estimating the total work to be done */
94 	void
95 	begin_phase_read_pk(
96 		ulint	n_sort_indexes);
97 
98 	/** Increment the number of records in PK (table) with 1.
99 	This is used to get more accurate estimate about the number of
100 	records per page which is needed because some phases work on
101 	per-page basis while some work on per-record basis and we want
102 	to get the progress as even as possible. */
103 	void
104 	n_pk_recs_inc();
105 
106 	/** Flag either one record or one page processed, depending on the
107 	current phase.
108 	@param[in]	inc_val	flag this many units processed at once */
109 	void
110 	inc(
111 		ulint	inc_val = 1);
112 
113 	/** Flag the end of reading of the primary key.
114 	Here we know the exact number of pages and records and calculate
115 	the number of records per page and refresh the estimate. */
116 	void
117 	end_phase_read_pk();
118 
119 	/** Flag the beginning of the sort phase.
120 	@param[in]	sort_multi_factor	since merge sort processes
121 	one page more than once we only update the estimate once per this
122 	many pages processed. */
123 	void
124 	begin_phase_sort(
125 		double	sort_multi_factor);
126 
127 	/** Flag the beginning of the insert phase. */
128 	void
129 	begin_phase_insert();
130 
131 	/** Flag the beginning of the log index phase. */
132 	void
133 	begin_phase_log_index();
134 
135 	/** Flag the beginning of the log table phase. */
136 	void
137 	begin_phase_log_table();
138 
139 	/** Flag the beginning of the end phase. */
140 	void
141 	begin_phase_end();
142 
143 private:
144 
145 	/** Update the estimate of total work to be done. */
146 	void
147 	reestimate();
148 
149 	/** Change the current phase.
150 	@param[in]	new_stage	pointer to the new stage to change to */
151 	void
152 	change_phase(
153 		const PSI_stage_info*	new_stage);
154 
155 	/** Performance schema accounting object. */
156 	PSI_stage_progress*	m_progress;
157 
158 	/** Old table PK. Used for calculating the estimate. */
159 	const dict_index_t*	m_pk;
160 
161 	/** Number of records in the primary key (table), including delete
162 	marked records. */
163 	ulint			m_n_pk_recs;
164 
165 	/** Number of leaf pages in the primary key. */
166 	ulint			m_n_pk_pages;
167 
168 	/** Estimated number of records per page in the primary key. */
169 	double			m_n_recs_per_page;
170 
171 	/** Number of indexes that are being added. */
172 	ulint			m_n_sort_indexes;
173 
174 	/** During the sort phase, increment the counter once per this
175 	many pages processed. This is because sort processes one page more
176 	than once. */
177 	ulint			m_sort_multi_factor;
178 
179 	/** Number of records processed during sort & insert phases. We
180 	need to increment the counter only once page, or once per
181 	recs-per-page records. */
182 	ulint			m_n_recs_processed;
183 
184 	/** Current phase. */
185 	enum {
186 		NOT_STARTED = 0,
187 		READ_PK = 1,
188 		SORT = 2,
189 		INSERT = 3,
190 		/* JAN: TODO: MySQL 5.7 vrs. MariaDB sql/log.h
191 		LOG_INDEX = 5,
192 		LOG_TABLE = 6, */
193 		LOG_INNODB_INDEX = 5,
194 		LOG_INNODB_TABLE = 6,
195 		END = 7,
196 	}			m_cur_phase;
197 };
198 
199 /** Destructor. */
200 inline
~ut_stage_alter_t()201 ut_stage_alter_t::~ut_stage_alter_t()
202 {
203 	if (m_progress == NULL) {
204 		return;
205 	}
206 
207 	/* Set completed = estimated before we quit. */
208 	mysql_stage_set_work_completed(
209 		m_progress,
210 		mysql_stage_get_work_estimated(m_progress));
211 
212 	mysql_end_stage();
213 }
214 
215 /** Flag an ALTER TABLE start (read primary key phase).
216 @param[in]	n_sort_indexes	number of indexes that will be sorted
217 during ALTER TABLE, used for estimating the total work to be done */
218 inline
219 void
begin_phase_read_pk(ulint n_sort_indexes)220 ut_stage_alter_t::begin_phase_read_pk(
221 	ulint	n_sort_indexes)
222 {
223 	m_n_sort_indexes = n_sort_indexes;
224 
225 	m_cur_phase = READ_PK;
226 
227 	m_progress = mysql_set_stage(
228 		srv_stage_alter_table_read_pk_internal_sort.m_key);
229 
230 	mysql_stage_set_work_completed(m_progress, 0);
231 	reestimate();
232 }
233 
234 /** Increment the number of records in PK (table) with 1.
235 This is used to get more accurate estimate about the number of
236 records per page which is needed because some phases work on
237 per-page basis while some work on per-record basis and we want
238 to get the progress as even as possible. */
239 inline
240 void
n_pk_recs_inc()241 ut_stage_alter_t::n_pk_recs_inc()
242 {
243 	m_n_pk_recs++;
244 }
245 
246 /** Flag either one record or one page processed, depending on the
247 current phase. */
248 inline
249 void
inc(ulint inc_val)250 ut_stage_alter_t::inc(ulint inc_val)
251 {
252 	if (m_progress == NULL) {
253 		return;
254 	}
255 
256 	ulint	multi_factor = 1;
257 	bool	should_proceed = true;
258 
259 	switch (m_cur_phase) {
260 	case NOT_STARTED:
261 		ut_error;
262 	case READ_PK:
263 		m_n_pk_pages++;
264 		ut_ad(inc_val == 1);
265 		/* Overall the read pk phase will read all the pages from the
266 		PK and will do work, proportional to the number of added
267 		indexes, thus when this is called once per read page we
268 		increment with 1 + m_n_sort_indexes */
269 		inc_val = 1 + m_n_sort_indexes;
270 		break;
271 	case SORT:
272 		multi_factor = m_sort_multi_factor;
273 		/* fall through */
274 	case INSERT: {
275 		/* Increment the progress every nth record. During
276 		sort and insert phases, this method is called once per
277 		record processed. We need fractional point numbers here
278 		because "records per page" is such a number naturally and
279 		to avoid rounding skew we want, for example: if there are
280 		(double) N records per page, then the work_completed
281 	        should be incremented on the inc() calls round(k*N),
282 		for k=1,2,3... */
283 		const double	every_nth = m_n_recs_per_page *
284 			static_cast<double>(multi_factor);
285 
286 		const ulint	k = static_cast<ulint>(
287 			round(static_cast<double>(m_n_recs_processed) /
288 			      every_nth));
289 
290 		const ulint	nth = static_cast<ulint>(
291 			round(static_cast<double>(k) * every_nth));
292 
293 		should_proceed = m_n_recs_processed == nth;
294 
295 		m_n_recs_processed++;
296 
297 		break;
298 	}
299 	/* JAN: TODO: MySQL 5.7
300 	case LOG_INDEX:
301 		break;
302 	case LOG_TABLE:
303 	break; */
304 	case LOG_INNODB_INDEX:
305 	case LOG_INNODB_TABLE:
306 		break;
307 	case END:
308 		break;
309 	}
310 
311 	if (should_proceed) {
312 		mysql_stage_inc_work_completed(m_progress, inc_val);
313 		reestimate();
314 	}
315 }
316 
317 /** Flag the end of reading of the primary key.
318 Here we know the exact number of pages and records and calculate
319 the number of records per page and refresh the estimate. */
320 inline
321 void
end_phase_read_pk()322 ut_stage_alter_t::end_phase_read_pk()
323 {
324 	reestimate();
325 
326 	if (m_n_pk_pages == 0) {
327 		/* The number of pages in the PK could be 0 if the tree is
328 		empty. In this case we set m_n_recs_per_page to 1 to avoid
329 		division by zero later. */
330 		m_n_recs_per_page = 1.0;
331 	} else {
332 		m_n_recs_per_page = std::max(
333 			static_cast<double>(m_n_pk_recs)
334 			/ static_cast<double>(m_n_pk_pages),
335 			1.0);
336 	}
337 }
338 
339 /** Flag the beginning of the sort phase.
340 @param[in]	sort_multi_factor	since merge sort processes
341 one page more than once we only update the estimate once per this
342 many pages processed. */
343 inline
344 void
begin_phase_sort(double sort_multi_factor)345 ut_stage_alter_t::begin_phase_sort(
346 	double	sort_multi_factor)
347 {
348 	if (sort_multi_factor <= 1.0) {
349 		m_sort_multi_factor = 1;
350 	} else {
351 		m_sort_multi_factor = static_cast<ulint>(
352 			round(sort_multi_factor));
353 	}
354 
355 	change_phase(&srv_stage_alter_table_merge_sort);
356 }
357 
358 /** Flag the beginning of the insert phase. */
359 inline
360 void
begin_phase_insert()361 ut_stage_alter_t::begin_phase_insert()
362 {
363 	change_phase(&srv_stage_alter_table_insert);
364 }
365 
366 /** Flag the beginning of the log index phase. */
367 inline
368 void
begin_phase_log_index()369 ut_stage_alter_t::begin_phase_log_index()
370 {
371 	change_phase(&srv_stage_alter_table_log_index);
372 }
373 
374 /** Flag the beginning of the log table phase. */
375 inline
376 void
begin_phase_log_table()377 ut_stage_alter_t::begin_phase_log_table()
378 {
379 	change_phase(&srv_stage_alter_table_log_table);
380 }
381 
382 /** Flag the beginning of the end phase. */
383 inline
384 void
begin_phase_end()385 ut_stage_alter_t::begin_phase_end()
386 {
387 	change_phase(&srv_stage_alter_table_end);
388 }
389 
390 /** Update the estimate of total work to be done. */
391 inline
392 void
reestimate()393 ut_stage_alter_t::reestimate()
394 {
395 	if (m_progress == NULL) {
396 		return;
397 	}
398 
399 	/* During the log table phase we calculate the estimate as
400 	work done so far + log size remaining. */
401 	if (m_cur_phase == LOG_INNODB_TABLE) {
402 		mysql_stage_set_work_estimated(
403 			m_progress,
404 			mysql_stage_get_work_completed(m_progress)
405 			+ row_log_estimate_work(m_pk));
406 		return;
407 	}
408 
409 	/* During the other phases we use a formula, regardless of
410 	how much work has been done so far. */
411 
412 	/* For number of pages in the PK - if the PK has not been
413 	read yet, use stat_n_leaf_pages (approximate), otherwise
414 	use the exact number we gathered. */
415 	const ulint	n_pk_pages
416 		= m_cur_phase != READ_PK
417 		? m_n_pk_pages
418 		: m_pk->stat_n_leaf_pages;
419 
420 	ulonglong	estimate __attribute__((unused))
421 		= n_pk_pages
422 		* (1 /* read PK */
423 		   + m_n_sort_indexes /* row_merge_buf_sort() inside the
424 				      read PK per created index */
425 		   + m_n_sort_indexes * 2 /* sort & insert per created index */)
426 		+ row_log_estimate_work(m_pk);
427 
428 	/* Prevent estimate < completed */
429 	estimate = std::max(estimate,
430 			    mysql_stage_get_work_completed(m_progress));
431 
432 	mysql_stage_set_work_estimated(m_progress, estimate);
433 }
434 
435 /** Change the current phase.
436 @param[in]	new_stage	pointer to the new stage to change to */
437 inline
438 void
change_phase(const PSI_stage_info * new_stage)439 ut_stage_alter_t::change_phase(
440 	const PSI_stage_info*	new_stage)
441 {
442 	if (m_progress == NULL) {
443 		return;
444 	}
445 
446 	if (new_stage == &srv_stage_alter_table_read_pk_internal_sort) {
447 		m_cur_phase = READ_PK;
448 	} else if (new_stage == &srv_stage_alter_table_merge_sort) {
449 		m_cur_phase = SORT;
450 	} else if (new_stage == &srv_stage_alter_table_insert) {
451 		m_cur_phase = INSERT;
452 	/* JAN: TODO: MySQL 5.7 used LOG_INDEX and LOG_TABLE */
453 	} else if (new_stage == &srv_stage_alter_table_log_index) {
454 		m_cur_phase = LOG_INNODB_INDEX;
455 	} else if (new_stage == &srv_stage_alter_table_log_table) {
456 		m_cur_phase = LOG_INNODB_TABLE;
457 	} else if (new_stage == &srv_stage_alter_table_end) {
458 		m_cur_phase = END;
459 	} else {
460 		ut_error;
461 	}
462 
463 	const ulonglong	c = mysql_stage_get_work_completed(m_progress);
464 	const ulonglong	e = mysql_stage_get_work_estimated(m_progress);
465 
466 	m_progress = mysql_set_stage(new_stage->m_key);
467 
468 	mysql_stage_set_work_completed(m_progress, c);
469 	mysql_stage_set_work_estimated(m_progress, e);
470 }
471 #else /* HAVE_PSI_STAGE_INTERFACE */
472 
473 class ut_stage_alter_t {
474 public:
ut_stage_alter_t(const dict_index_t *)475 	explicit ut_stage_alter_t(const dict_index_t*) {}
476 
begin_phase_read_pk(ulint)477 	void begin_phase_read_pk(ulint)	{}
478 
n_pk_recs_inc()479 	void n_pk_recs_inc() {}
480 
inc()481 	void inc() {}
inc(ulint)482 	void inc(ulint) {}
483 
end_phase_read_pk()484 	void end_phase_read_pk() {}
485 
begin_phase_sort(double)486 	void begin_phase_sort(double) {}
487 
begin_phase_insert()488 	void begin_phase_insert() {}
489 
begin_phase_log_index()490 	void begin_phase_log_index() {}
491 
begin_phase_log_table()492 	void begin_phase_log_table() {}
493 
begin_phase_end()494 	void begin_phase_end() {}
495 };
496 
497 #endif /* HAVE_PSI_STAGE_INTERFACE */
498 
499 #endif /* ut0stage_h */
500