1 /*****************************************************************************
2 
3 Copyright (c) 2014, 2021, Oracle and/or its affiliates.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /** @file ha_innopart.cc
28 Code for native partitioning in InnoDB.
29 
30 Created Nov 22, 2013 Mattias Jonsson */
31 
32 #include "univ.i"
33 
34 /* Include necessary SQL headers */
35 #include <debug_sync.h>
36 #include <log.h>
37 #include <strfunc.h>
38 #include <sql_acl.h>
39 #include <sql_class.h>
40 #include <sql_show.h>
41 #include <sql_table.h>
42 #include <my_check_opt.h>
43 
44 /* Include necessary InnoDB headers */
45 #include "btr0sea.h"
46 #include "dict0dict.h"
47 #include "dict0stats.h"
48 #include "lock0lock.h"
49 #include "row0import.h"
50 #include "row0merge.h"
51 #include "row0mysql.h"
52 #include "row0quiesce.h"
53 #include "row0sel.h"
54 #include "row0ins.h"
55 #include "row0upd.h"
56 #include "fsp0sysspace.h"
57 #include "ut0ut.h"
58 
59 #include "ha_innodb.h"
60 #include "ha_innopart.h"
61 #include "partition_info.h"
62 #include "key.h"
63 #include "dict0priv.h"
64 
65 #define INSIDE_HA_INNOPART_CC
66 
67 /* To be backwards compatible we also fold partition separator on windows. */
68 #ifdef _WIN32
69 const char* part_sep = "#p#";
70 const char* sub_sep = "#sp#";
71 #else
72 const char* part_sep = "#P#";
73 const char* sub_sep = "#SP#";
74 #endif /* _WIN32 */
75 
76 /* Partition separator for *nix platforms */
77 const char* part_sep_nix = "#P#";
78 const char* sub_sep_nix = "#SP#";
79 
80 extern char*	innobase_file_format_max;
81 
82 Ha_innopart_share::Ha_innopart_share(
83 	TABLE_SHARE*	table_share)
84 	:
85 	Partition_share(),
86 	m_table_parts(),
87 	m_index_mapping(),
88 	m_tot_parts(),
89 	m_index_count(),
90 	m_ref_count(),
91 	m_table_share(table_share)
92 {}
93 
94 Ha_innopart_share::~Ha_innopart_share()
95 {
96 	ut_ad(m_ref_count == 0);
97 	if (m_table_parts != NULL) {
98 		ut_free(m_table_parts);
99 		m_table_parts = NULL;
100 	}
101 	if (m_index_mapping != NULL) {
102 		ut_free(m_index_mapping);
103 		m_index_mapping = NULL;
104 	}
105 }
106 
107 /** Fold to lower case if windows or lower_case_table_names == 1.
108 @param[in,out]	s	String to fold.*/
109 void
110 Ha_innopart_share::partition_name_casedn_str(
111 	char*	s)
112 {
113 #ifdef _WIN32
114 	innobase_casedn_str(s);
115 #endif
116 }
117 
118 /** Translate and append partition name.
119 @param[out]	to	String to write in filesystem charset
120 @param[in]	from	Name in system charset
121 @param[in]	sep	Separator
122 @param[in]	len	Max length of to buffer
123 @return	length of written string. */
124 size_t
125 Ha_innopart_share::append_sep_and_name(
126 	char*		to,
127 	const char*	from,
128 	const char*	sep,
129 	size_t		len)
130 {
131 	size_t	ret;
132 	size_t	sep_len = strlen(sep);
133 
134 	ut_ad(len > sep_len + strlen(from));
135 	ut_ad(to != NULL);
136 	ut_ad(from != NULL);
137 	ut_ad(from[0] != '\0');
138 	memcpy(to, sep, sep_len);
139 
140 	ret = tablename_to_filename(from, to + sep_len,
141 		len - sep_len);
142 
143 	/* Don't convert to lower case for nix style name. */
144 	if (strcmp(sep, part_sep_nix) != 0
145 	    && strcmp(sep, sub_sep_nix) != 0) {
146 
147 		partition_name_casedn_str(to);
148 	}
149 
150 	return(ret + sep_len);
151 }
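
/* Illustrative example (added comment, not part of the original source):
with the buffer already holding "test/t1", appending partition "p0" with
part_sep_nix yields "test/t1#P#p0", and appending subpartition "sp0" with
sub_sep_nix then yields "test/t1#P#p0#SP#sp0", i.e. the internal InnoDB
table name used for that (sub)partition, as built in open_table_parts()
below. The names "test/t1", "p0" and "sp0" are hypothetical. */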
152 
153 /** Copy a cached MySQL row.
154 If requested, also avoids overwriting non-read columns.
155 @param[out]	buf		Row in MySQL format.
156 @param[in]	cached_row	Which row to copy. */
157 inline
158 void
159 ha_innopart::copy_cached_row(
160 	uchar*		buf,
161 	const uchar*	cached_row)
162 {
163 	if (m_prebuilt->keep_other_fields_on_keyread) {
164 		row_sel_copy_cached_fields_for_mysql(buf, cached_row,
165 			m_prebuilt);
166 	} else {
167 		memcpy(buf, cached_row, m_rec_length);
168 	}
169 }
170 
171 /** Open one partition.
172 @param[in]	part_id		Partition id to open.
173 @param[in]	partition_name	Name of internal innodb table to open.
174 @return	false on success else true. */
175 bool
176 Ha_innopart_share::open_one_table_part(
177 	uint		part_id,
178 	const char*	partition_name)
179 {
180 	char	norm_name[FN_REFLEN];
181 
182 	normalize_table_name(norm_name, partition_name);
183 	m_table_parts[part_id] =
184 		ha_innobase::open_dict_table(partition_name, norm_name,
185 					     TRUE, DICT_ERR_IGNORE_NONE);
186 
187 	if (m_table_parts[part_id] == NULL) {
188 		return(true);
189 	}
190 
191 	dict_table_t *ib_table = m_table_parts[part_id];
192 	if ((!DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID)
193 	     && m_table_share->fields
194 		 != (dict_table_get_n_user_cols(ib_table)
195 		     + dict_table_get_n_v_cols(ib_table)))
196 	    || (DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID)
197 		&& (m_table_share->fields
198 		    != dict_table_get_n_user_cols(ib_table)
199 		       + dict_table_get_n_v_cols(ib_table) - 1))) {
200 		ib::warn() << "Partition `" << get_partition_name(part_id)
201 			<< "` contains " << dict_table_get_n_user_cols(ib_table)
202 			<< " user defined columns in InnoDB, but "
203 			<< m_table_share->fields
204 			<< " columns in MySQL. Please check"
205 			" INFORMATION_SCHEMA.INNODB_SYS_COLUMNS and " REFMAN
206 			"innodb-troubleshooting.html for how to resolve the"
207 			" issue.";
208 
209 		/* Mark this partition as corrupted, so the drop table
210 		or force recovery can still use it, but not others.
211 		TODO: persist table->corrupted so it will be retained on
212 		restart and out-of-bounds operations will see it. */
213 
214 		ib_table->corrupted = true;
215 		dict_table_close(ib_table, FALSE, FALSE);
216 	}
217 
218 	/* TODO: To save memory, compare with first partition and reuse
219 	the column names etc. in the internal InnoDB meta-data cache. */
220 
221 	return(false);
222 }
223 
224 /** Set up the virtual column template for a partitioned table, and point
225 all m_table_parts[]->vc_templ to it.
226 @param[in]	table		MySQL TABLE object
227 @param[in]	ib_table	InnoDB dict_table_t
228 @param[in]	table_name	Table name (db/table_name) */
229 void
230 Ha_innopart_share::set_v_templ(
231 	TABLE*		table,
232 	dict_table_t*	ib_table,
233 	const char*	name)
234 {
235 	ut_ad(mutex_own(&dict_sys->mutex));
236 
237 	if (ib_table->n_v_cols > 0) {
238 		for (ulint i = 0; i < m_tot_parts; i++) {
239 			if (m_table_parts[i]->vc_templ == NULL) {
240 				m_table_parts[i]->vc_templ
241 					= UT_NEW_NOKEY(dict_vcol_templ_t());
242 				m_table_parts[i]->vc_templ->vtempl = NULL;
243 			} else if (m_table_parts[i]->get_ref_count() == 1) {
244 				/* Clean and refresh the template */
245 				dict_free_vc_templ(m_table_parts[i]->vc_templ);
246 				m_table_parts[i]->vc_templ->vtempl = NULL;
247 			}
248 
249 			if (m_table_parts[i]->vc_templ->vtempl == NULL) {
250 				innobase_build_v_templ(
251 					table, ib_table,
252 					m_table_parts[i]->vc_templ,
253 					NULL, true, name);
254 			}
255 		}
256 	}
257 }
258 
259 /** Initialize the share with table and indexes per partition.
260 @param[in]	part_info	Partition info (partition names to use).
261 @param[in]	table_name	Table name (db/table_name).
262 @return	false on success else true. */
263 bool
264 Ha_innopart_share::open_table_parts(
265 	partition_info*	part_info,
266 	const char*	table_name)
267 {
268 	size_t	table_name_len;
269 	size_t	len;
270 	uint	ib_num_index;
271 	uint	mysql_num_index;
272 	char	partition_name[FN_REFLEN];
273 	bool	index_loaded = true;
274 
275 #ifndef NDEBUG
276 	if (m_table_share->tmp_table == NO_TMP_TABLE) {
277 		mysql_mutex_assert_owner(&m_table_share->LOCK_ha_data);
278 	}
279 #endif /* NDEBUG */
280 	m_ref_count++;
281 	if (m_table_parts != NULL) {
282 		ut_ad(m_ref_count > 1);
283 		ut_ad(m_tot_parts > 0);
284 
285 		/* Increment dict_table_t reference count for all partitions */
286 		mutex_enter(&dict_sys->mutex);
287 		for (uint i = 0; i < m_tot_parts; i++) {
288 			dict_table_t*	table = m_table_parts[i];
289 			table->acquire();
290 			ut_ad(table->get_ref_count() >= m_ref_count);
291 		}
292 		mutex_exit(&dict_sys->mutex);
293 
294 		return(false);
295 	}
296 	ut_ad(m_ref_count == 1);
297 	m_tot_parts = part_info->get_tot_partitions();
298 	size_t	table_parts_size = sizeof(dict_table_t*) * m_tot_parts;
299 	m_table_parts = static_cast<dict_table_t**>(
300 		ut_zalloc(table_parts_size, mem_key_partitioning));
301 	if (m_table_parts == NULL) {
302 		m_ref_count--;
303 		return(true);
304 	}
305 
306 	/* Set up the array over all table partitions. */
307 	table_name_len = strlen(table_name);
308 	memcpy(partition_name, table_name, table_name_len);
309 	List_iterator<partition_element>
310 				part_it(part_info->partitions);
311 	partition_element*	part_elem;
312 	uint			i = 0;
313 
314 	while ((part_elem = part_it++)) {
315 		len = append_sep_and_name(
316 				partition_name + table_name_len,
317 				part_elem->partition_name,
318 				part_sep_nix,
319 				FN_REFLEN - table_name_len);
320 		if (part_info->is_sub_partitioned()) {
321 			List_iterator<partition_element>
322 				sub_it(part_elem->subpartitions);
323 			partition_element*	sub_elem;
324 			while ((sub_elem = sub_it++)) {
325 				append_sep_and_name(
326 					partition_name
327 					+ table_name_len + len,
328 					sub_elem->partition_name,
329 					sub_sep_nix,
330 					FN_REFLEN - table_name_len - len);
331 				if (open_one_table_part(i, partition_name)) {
332 					goto err;
333 				}
334 				i++;
335 			}
336 		} else {
337 			if (open_one_table_part(i, partition_name)) {
338 				goto err;
339 			}
340 			i++;
341 		}
342 	}
343 	ut_ad(i == m_tot_parts);
344 
345 	/* Create the mapping of mysql index number to innodb indexes. */
346 
347 	ib_num_index = (uint) UT_LIST_GET_LEN(m_table_parts[0]->indexes);
348 	mysql_num_index = part_info->table->s->keys;
349 
350 	/* If there is an inconsistency between the MySQL and InnoDB dictionary
351 	(metadata) information, the number of indexes defined in MySQL
352 	could exceed that in InnoDB; do not build the index translation
353 	table in such a case. */
354 
355 	if (ib_num_index < mysql_num_index) {
356 		ut_ad(0);
357 		goto err;
358 	}
359 
360 	if (mysql_num_index != 0) {
361 		size_t	alloc_size = mysql_num_index * m_tot_parts
362 			* sizeof(*m_index_mapping);
363 		m_index_mapping = static_cast<dict_index_t**>(
364 			ut_zalloc(alloc_size, mem_key_partitioning));
365 		if (m_index_mapping == NULL) {
366 
367 			/* Report an error if index_mapping continues to be
368 			NULL and mysql_num_index is a non-zero value. */
369 
370 			ib::error() << "Failed to allocate memory for"
371 				" index translation table. Number of"
372 				" Index:" << mysql_num_index;
373 			goto err;
374 		}
375 	}
376 
377 	/* For each index in the mysql key_info array, fetch its
378 	corresponding InnoDB index pointer into index_mapping
379 	array. */
380 
381 	for (ulint idx = 0; idx < mysql_num_index; idx++) {
382 		for (ulint part = 0; part < m_tot_parts; part++) {
383 			ulint	count = part * mysql_num_index + idx;
384 
385 			/* Fetch index pointers into index_mapping according
386 			to mysql index sequence. */
387 
388 			m_index_mapping[count] = dict_table_get_index_on_name(
389 				m_table_parts[part],
390 				part_info->table->key_info[idx].name);
391 
392 			if (m_index_mapping[count] == NULL) {
393 				ib::error() << "Cannot find index `"
394 					<< part_info->table->key_info[idx].name
395 					<< "` in InnoDB index dictionary"
396 					" partition `"
397 					<< get_partition_name(part) << "`.";
398 				index_loaded = false;
399 				break;
400 			}
401 
402 			/* Double check fetched index has the same
403 			column info as those in mysql key_info. */
404 
405 			if (!innobase_match_index_columns(
406 					&part_info->table->key_info[idx],
407 					m_index_mapping[count])) {
408 				ib::error() << "Found index `"
409 					<< part_info->table->key_info[idx].name
410 					<< "` whose column info does not match"
411 					" that of MySQL.";
412 				index_loaded = false;
413 				break;
414 			}
415 		}
416 	}
417 	if (!index_loaded && m_index_mapping != NULL) {
418 		ut_free(m_index_mapping);
419 		m_index_mapping = NULL;
420 	}
421 
422 	/* Successfully built the translation table. */
423 	m_index_count = mysql_num_index;
424 
425 	return(false);
426 err:
427 	close_table_parts();
428 
429 	return(true);
430 }
431 
432 /** Close all partitions. */
433 void
434 Ha_innopart_share::close_table_parts()
435 {
436 #ifndef NDEBUG
437 	if (m_table_share->tmp_table == NO_TMP_TABLE) {
438 		mysql_mutex_assert_owner(&m_table_share->LOCK_ha_data);
439 	}
440 #endif /* NDEBUG */
441 	m_ref_count--;
442 	if (m_ref_count != 0) {
443 
444 		/* Decrement dict_table_t reference count for all partitions */
445 		mutex_enter(&dict_sys->mutex);
446 		for (uint i = 0; i < m_tot_parts; i++) {
447 			dict_table_t*	table = m_table_parts[i];
448 			table->release();
449 			ut_ad(table->get_ref_count() >= m_ref_count);
450 		}
451 		mutex_exit(&dict_sys->mutex);
452 
453 		return;
454 	}
455 
456 	/* Last instance closed, close all table partitions and
457 	free the memory. */
458 
459 	mutex_enter(&dict_sys->mutex);
460 	if (m_table_parts != NULL) {
461 		for (uint i = 0; i < m_tot_parts; i++) {
462 			if (m_table_parts[i] != NULL) {
463 				dict_table_close(m_table_parts[i], TRUE, TRUE);
464 			}
465 		}
466 		ut_free(m_table_parts);
467 		m_table_parts = NULL;
468 	}
469 	mutex_exit(&dict_sys->mutex);
470 	if (m_index_mapping != NULL) {
471 		ut_free(m_index_mapping);
472 		m_index_mapping = NULL;
473 	}
474 
475 	m_tot_parts = 0;
476 	m_index_count = 0;
477 }
478 
479 /** Get index.
480 Find the index of the specified partition and key number.
481 @param[in]	part_id	Partition number.
482 @param[in]	keynr	Key number.
483 @return	Index pointer or NULL. */
484 inline
485 dict_index_t*
486 Ha_innopart_share::get_index(
487 	uint	part_id,
488 	uint	keynr)
489 {
490 	ut_a(part_id < m_tot_parts);
491 	ut_ad(keynr < m_index_count || keynr == MAX_KEY);
492 	if (m_index_mapping == NULL
493 	    || keynr >= m_index_count) {
494 
495 		if (keynr == MAX_KEY) {
496 			return(dict_table_get_first_index(
497 				get_table_part(part_id)));
498 		}
499 		return(NULL);
500 	}
501 	return(m_index_mapping[m_index_count * part_id + keynr]);
502 }
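
/* Layout note (added comment): m_index_mapping is one flat array used as a
row-major [m_tot_parts][m_index_count] table, so the InnoDB index for MySQL
key number keynr in partition part_id is found at
m_index_mapping[part_id * m_index_count + keynr], as computed above. */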
503 
504 /** Get MySQL key number corresponding to InnoDB index.
505 Calculates the key number used inside MySQL for an Innobase index. We will
506 first check the "index translation table" for a match of the index to get
507 the index number. If there is no "index translation table", or we are not
508 able to find the index in the translation table, then we will fall back
509 to the traditional way of looping through the dict_index_t list to find a
510 match. In this case, we have to take into account whether we generated a
511 default clustered index for the table.
512 @param[in]	part_id	Partition the index belongs to.
513 @param[in]	index	Index to return MySQL key number for.
514 @return	the key number used inside MySQL or UINT_MAX if key is not found. */
515 inline
516 uint
517 Ha_innopart_share::get_mysql_key(
518 	uint			part_id,
519 	const dict_index_t*	index)
520 {
521 	ut_ad(index != NULL);
522 	ut_ad(m_index_mapping != NULL);
523 	ut_ad(m_tot_parts);
524 
525 	if (index != NULL && m_index_mapping != NULL) {
526 		uint	start;
527 		uint	end;
528 
529 		if (part_id < m_tot_parts) {
530 			start = part_id * m_index_count;
531 			end = start + m_index_count;
532 		} else {
533 			start = 0;
534 			end = m_tot_parts * m_index_count;
535 		}
536 		for (uint i = start; i < end; i++) {
537 			if (m_index_mapping[i] == index) {
538 				return(i % m_index_count);
539 			}
540 		}
541 
542 		/* Print an error message if we cannot find the index
543 		in the "index translation table". */
544 
545 		if (index->is_committed()) {
546 			ib::error() << "Cannot find index "
547 				<< index->name
548 				<< " in InnoDB index translation table.";
549 		}
550 	}
551 
552 	return(UINT_MAX);
553 }
554 
555 /** Helper function to set a bit in a bitmap.
556 @param[in,out]	buf	Bitmap buffer to update the bit in.
557 @param[in]	pos	Bit number (index starts at 0). */
558 static
559 inline
560 void
561 set_bit(
562 	byte*	buf,
563 	size_t	pos)
564 {
565 	buf[pos/8] |= (0x1 << (pos & 0x7));
566 }
567 
568 /** Helper function to clear a bit in a bitmap.
569 @param[in,out]	buf	Bitmap buffer to update the bit in.
570 @param[in]	pos	Bit number (index starts at 0). */
571 static
572 inline
573 void
574 clear_bit(
575 	byte*	buf,
576 	size_t	pos)
577 {
578 	buf[pos/8] &= ~(0x1 << (pos & 0x7));
579 }
580 
581 /** Helper function to get a bit from a bitmap.
582 @param[in]	buf	Bitmap buffer.
583 @param[in]	pos	Bit number (index starts at 0).
584 @return	byte set to 0x0 or 0x1.
585 @retval	0x0 bit not set.
586 @retval	0x1 bit set. */
587 static
588 inline
589 byte
590 get_bit(
591 	byte*	buf,
592 	size_t	pos)
593 {
594 	return((buf[pos/8] >> (pos & 0x7)) & 0x1);
595 }
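
/* Usage sketch (added comment, for illustration only): the helpers above
treat the buffer as a bitmap with one bit per partition. For example,
partition 10 maps to byte 10 / 8 == 1 and bit 10 & 0x7 == 2, so
set_bit(buf, 10) ORs 0x04 into buf[1] and get_bit(buf, 10) reads it back. */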
596 
597 /** Helper class for encapsulating new/altered partitions during
598 ADD/REORG/... PARTITION. */
599 class Altered_partitions
600 {
601 private:
602 	/** New partitions during ADD/REORG/... PARTITION. */
603 	dict_table_t**	m_new_table_parts;
604 
605 	/** Insert nodes per partition. */
606 	ins_node_t**	m_ins_nodes;
607 
608 	/** sql_stat_start per partition. */
609 	byte*		m_sql_stat_start;
610 
611 	/** Trx id per partition. */
612 	trx_id_t*	m_trx_ids;
613 
614 	/** Number of new partitions. */
615 	size_t		m_num_new_parts;
616 
617 	/** Only need to create the partitions (no open/lock). */
618 	bool		m_only_create;
619 
620 public:
621 	Altered_partitions(
622 		uint n_partitions,
623 		bool only_create);
624 
625 	~Altered_partitions();
626 
627 	bool
628 	initialize();
629 
630 	bool
631 	only_create() const
632 	{
633 		return(m_only_create);
634 	}
635 
636 	/** Set currently used partition.
637 	@param[in]	new_part_id	Partition id to set.
638 	@param[in]	part	InnoDB table to use. */
639 	inline
640 	void
641 	set_part(
642 		ulint		new_part_id,
643 		dict_table_t*	part)
644 	{
645 		ut_ad(m_new_table_parts[new_part_id] == NULL);
646 		m_new_table_parts[new_part_id] = part;
647 		set_bit(m_sql_stat_start, new_part_id);
648 	}
649 
650 	/** Get lower level InnoDB table for partition.
651 	@param[in]	part_id	Partition id.
652 	@return Lower level InnoDB table for the partition id. */
653 	inline
654 	dict_table_t*
655 	part(
656 		uint	part_id) const
657 	{
658 		ut_ad(part_id < m_num_new_parts);
659 		return(m_new_table_parts[part_id]);
660 	}
661 
662 	/** Set up prebuilt for using a specified partition.
663 	@param[in]	prebuilt	Prebuilt to update.
664 	@param[in]	new_part_id	Partition to use. */
665 	inline
666 	void
667 	get_prebuilt(
668 		row_prebuilt_t*	prebuilt,
669 		uint		new_part_id) const
670 	{
671 		ut_ad(m_new_table_parts[new_part_id]);
672 		prebuilt->table = m_new_table_parts[new_part_id];
673 		prebuilt->ins_node = m_ins_nodes[new_part_id];
674 		prebuilt->trx_id = m_trx_ids[new_part_id];
675 		prebuilt->sql_stat_start = get_bit(m_sql_stat_start,
676 						new_part_id);
677 	}
678 
679 	/** Update cached values for a partition from prebuilt.
680 	@param[in]	prebuilt	Prebuilt to copy from.
681 	@param[in]	new_part_id	Partition id to copy. */
682 	inline
683 	void
684 	set_from_prebuilt(
685 		row_prebuilt_t*	prebuilt,
686 		uint		new_part_id)
687 	{
688 		ut_ad(m_new_table_parts[new_part_id] == prebuilt->table);
689 		m_ins_nodes[new_part_id] = prebuilt->ins_node;
690 		m_trx_ids[new_part_id] = prebuilt->trx_id;
691 		if (prebuilt->sql_stat_start == 0) {
692 			clear_bit(m_sql_stat_start, new_part_id);
693 		}
694 	}
695 };
696 
697 Altered_partitions::Altered_partitions(
698 		uint n_partitions,
699 		bool only_create)
700 		:
701 		m_new_table_parts(),
702 		m_ins_nodes(),
703 		m_sql_stat_start(),
704 		m_trx_ids(),
705 		m_num_new_parts(n_partitions),
706 		m_only_create(only_create)
707 	{}
708 
709 Altered_partitions::~Altered_partitions()
710 {
711 	if (m_new_table_parts != NULL) {
712 		for (ulint i = 0; i < m_num_new_parts; i++) {
713 			if (m_new_table_parts[i] != NULL) {
714 				dict_table_close(m_new_table_parts[i],
715 					false, true);
716 			}
717 		}
718 		ut_free(m_new_table_parts);
719 		m_new_table_parts = NULL;
720 	}
721 	if (m_ins_nodes != NULL) {
722 		for (ulint i = 0; i < m_num_new_parts; i++) {
723 			if (m_ins_nodes[i] != NULL) {
724 				ins_node_t*	ins = m_ins_nodes[i];
725 				ut_ad(ins->select == NULL);
726 				que_graph_free_recursive(ins->select);
727 				ins->select = NULL;
728 				if (ins->entry_sys_heap != NULL) {
729 					mem_heap_free(ins->entry_sys_heap);
730 					ins->entry_sys_heap = NULL;
731 				}
732 			}
733 		}
734 		ut_free(m_ins_nodes);
735 		m_ins_nodes = NULL;
736 	}
737 	if (m_sql_stat_start != NULL) {
738 		ut_free(m_sql_stat_start);
739 		m_sql_stat_start = NULL;
740 	}
741 	if (m_trx_ids != NULL) {
742 		ut_free(m_trx_ids);
743 		m_trx_ids = NULL;
744 	}
745 }
746 
747 /** Initialize the object.
748 @return false on success else true. */
749 bool
750 Altered_partitions::initialize()
751 {
752 	size_t	alloc_size = sizeof(*m_new_table_parts) * m_num_new_parts;
753 	m_new_table_parts = static_cast<dict_table_t**>(
754 		ut_zalloc(alloc_size, mem_key_partitioning));
755 	if (m_new_table_parts == NULL) {
756 		return(true);
757 	}
758 
759 	alloc_size = sizeof(*m_ins_nodes) * m_num_new_parts;
760 	m_ins_nodes = static_cast<ins_node_t**>(
761 		ut_zalloc(alloc_size, mem_key_partitioning));
762 	if (m_ins_nodes == NULL) {
763 		ut_free(m_new_table_parts);
764 		m_new_table_parts = NULL;
765 		return(true);
766 	}
767 
768 	alloc_size = sizeof(*m_sql_stat_start)
769 		* UT_BITS_IN_BYTES(m_num_new_parts);
770 	m_sql_stat_start = static_cast<byte*>(
771 		ut_zalloc(alloc_size, mem_key_partitioning));
772 	if (m_sql_stat_start == NULL) {
773 		ut_free(m_new_table_parts);
774 		m_new_table_parts = NULL;
775 		ut_free(m_ins_nodes);
776 		m_ins_nodes = NULL;
777 		return(true);
778 	}
779 
780 	alloc_size = sizeof(*m_trx_ids) * m_num_new_parts;
781 	m_trx_ids = static_cast<trx_id_t*>(
782 		ut_zalloc(alloc_size, mem_key_partitioning));
783 	if (m_trx_ids == NULL) {
784 		ut_free(m_new_table_parts);
785 		m_new_table_parts = NULL;
786 		ut_free(m_ins_nodes);
787 		m_ins_nodes = NULL;
788 		ut_free(m_sql_stat_start);
789 		m_sql_stat_start = NULL;
790 		return(true);
791 	}
792 
793 	return(false);
794 }
795 
796 /** Construct ha_innopart handler.
797 @param[in]	hton		Handlerton.
798 @param[in]	table_arg	MySQL Table.
799 @return	a new ha_innopart handler. */
800 ha_innopart::ha_innopart(
801 	handlerton*	hton,
802 	TABLE_SHARE*	table_arg)
803 	:
804 	ha_innobase(hton, table_arg),
805 	Partition_helper(this),
806 	m_ins_node_parts(),
807 	m_upd_node_parts(),
808 	m_blob_heap_parts(),
809 	m_trx_id_parts(),
810 	m_row_read_type_parts(),
811 	m_sql_stat_start_parts(),
812 	m_pcur(),
813 	m_clust_pcur(),
814 	m_new_partitions()
815 {
816 	m_int_table_flags &=	~(HA_INNOPART_DISABLED_TABLE_FLAGS);
817 
818 	/* INNOBASE_SHARE is not used in ha_innopart.
819 	This also flags for ha_innobase that it is a partitioned table.
820 	And make it impossible to use legacy share functionality. */
821 
822 	m_share = NULL;
823 }
824 
825 /** Destruct ha_innopart handler. */
826 ha_innopart::~ha_innopart()
827 {}
828 
829 /** Return supported alter table flags.
830 @param[in]	flags	Flags to support.
831 @return	Supported flags. */
832 uint
833 ha_innopart::alter_table_flags(
834 	uint	flags)
835 {
836 	return(HA_PARTITION_FUNCTION_SUPPORTED | HA_FAST_CHANGE_PARTITION);
837 }
838 
839 /** Internally called for initializing auto increment value.
840 Only called from ha_innobase::discard_or_import_table_space()
841 and should not do anything, since ha_innopart will initialize
842 it on first usage. */
843 int
844 ha_innopart::innobase_initialize_autoinc()
845 {
846 	ut_ad(0);
847 	return(0);
848 }
849 
850 /** Set the autoinc column max value.
851 This should only be called once from ha_innobase::open().
852 Therefore there's no need for a covering lock.
853 @param[in]	no_lock	Ignored!
854 @return	0 for success or error code. */
855 inline
856 int
857 ha_innopart::initialize_auto_increment(
858 	bool	/* no_lock */)
859 {
860 	int		error = 0;
861 	ulonglong	auto_inc = 0;
862 	const Field*	field = table->found_next_number_field;
863 
864 #ifndef NDEBUG
865 	if (table_share->tmp_table == NO_TMP_TABLE)
866 	{
867 		mysql_mutex_assert_owner(m_part_share->auto_inc_mutex);
868 	}
869 #endif
870 
871 	/* Since a table can already be "open" in InnoDB's internal
872 	data dictionary, we only init the autoinc counter once, the
873 	first time the table is loaded. We can safely reuse the
874 	autoinc value from a previous MySQL open. */
875 
876 	if (m_part_share->auto_inc_initialized) {
877 		/* Already initialized, nothing to do. */
878 		return(0);
879 	}
880 
881 	if (field == NULL) {
882 		ib::info() << "Unable to determine the AUTOINC column name";
883 	}
884 
885 	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
886 		/* If the recovery level is set so high that writes
887 		are disabled we force the AUTOINC counter to 0
888 		value effectively disabling writes to the table.
889 		Secondly, we avoid reading the table in case the read
890 		results in failure due to a corrupted table/index.
891 
892 		We will not return an error to the client, so that the
893 		tables can be dumped with minimal hassle. If an error
894 		were returned in this case, the first attempt to read
895 		the table would fail and subsequent SELECTs would succeed. */
896 
897 	} else if (field == NULL) {
898 		/* This is a far more serious error, best to avoid
899 		opening the table and return failure. */
900 
901 		my_error(ER_AUTOINC_READ_FAILED, MYF(0));
902 		error = HA_ERR_AUTOINC_READ_FAILED;
903 	} else {
904 		dict_index_t*	index;
905 		const char*	col_name;
906 		ib_uint64_t	read_auto_inc;
907 		ib_uint64_t	max_auto_inc = 0;
908 		ulint		err;
909 		dict_table_t*	ib_table;
910 		ulonglong	col_max_value;
911 
912 		col_max_value = field->get_max_int_value();
913 
914 		update_thd(ha_thd());
915 
916 		col_name = field->field_name;
917 		for (uint part = 0; part < m_tot_parts; part++) {
918 			ib_table = m_part_share->get_table_part(part);
919 			dict_table_autoinc_lock(ib_table);
920 			read_auto_inc = dict_table_autoinc_read(ib_table);
921 			if (read_auto_inc != 0) {
922 				set_if_bigger(max_auto_inc, read_auto_inc);
923 				dict_table_autoinc_unlock(ib_table);
924 				continue;
925 			}
926 			/* Execute SELECT MAX(col_name) FROM TABLE; */
927 			index = m_part_share->get_index(
928 					part, table->s->next_number_index);
929 			err = row_search_max_autoinc(
930 				index, col_name, &read_auto_inc);
931 
932 			switch (err) {
933 			case DB_SUCCESS: {
934 				/* At this stage we do not know the
935 				increment nor the offset,
936 				so use a default increment of 1. */
937 
938 				auto_inc = innobase_next_autoinc(
939 					read_auto_inc, 1, 1, 0, col_max_value);
940 				set_if_bigger(max_auto_inc, auto_inc);
941 				dict_table_autoinc_initialize(ib_table,
942 					auto_inc);
943 				break;
944 			}
945 			case DB_RECORD_NOT_FOUND:
946 				ib::error() << "MySQL and InnoDB data"
947 					" dictionaries are out of sync. Unable"
948 					" to find the AUTOINC column "
949 					<< col_name << " in the InnoDB table "
950 					<< index->table->name << ". We set the"
951 					" next AUTOINC column value to 0, in"
952 					" effect disabling the AUTOINC next"
953 					" value generation.";
954 
955 				ib::info() << "You can either set the next"
956 					" AUTOINC value explicitly using ALTER"
957 					" TABLE or fix the data dictionary by"
958 					" recreating the table.";
959 
960 				/* We want the open to succeed, so that the
961 				user can take corrective action. ie. reads
962 				should succeed but updates should fail. */
963 
964 				/* This will disable the AUTOINC generation. */
965 				auto_inc = 0;
966 				goto done;
967 			default:
968 				/* row_search_max_autoinc() should only return
969 				one of DB_SUCCESS or DB_RECORD_NOT_FOUND. */
970 
971 				ut_error;
972 			}
973 			dict_table_autoinc_unlock(ib_table);
974 		}
975 		auto_inc = max_auto_inc;
976 	}
977 
978 done:
979 	m_part_share->next_auto_inc_val = auto_inc;
980 	m_part_share->auto_inc_initialized = true;
981 	return(error);
982 }
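
/* Note (added comment): the loop above takes the maximum auto-increment
value over all partitions; any partition whose cached counter is still 0 is
probed with row_search_max_autoinc() (the SELECT MAX() equivalent) before
the shared next_auto_inc_val is set for the whole partitioned table. */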
983 
984 /** Opens a partitioned InnoDB table.
985 Initializes needed data and opens the table which already exists
986 in an InnoDB database.
987 @param[in]	name		Table name (db/tablename)
988 @param[in]	mode		Not used
989 @param[in]	test_if_locked	Not used
990 @return	0 or error number. */
991 int
992 ha_innopart::open(
993 	const char*	name,
994 	int		/*mode*/,
995 	uint		/*test_if_locked*/)
996 {
997 	dict_table_t*	ib_table;
998 	char		norm_name[FN_REFLEN];
999 	THD*		thd;
1000 
1001 	DBUG_ENTER("ha_innopart::open");
1002 
1003 	ut_ad(table);
1004 	if (m_part_info == NULL) {
1005 		/* Must be during ::clone()! */
1006 		ut_ad(table->part_info != NULL);
1007 		m_part_info = table->part_info;
1008 	}
1009 	thd = ha_thd();
1010 
1011 	/* Under some cases MySQL seems to call this function while
1012 	holding search latch(es). This breaks the latching order as
1013 	we acquire dict_sys->mutex below and leads to a deadlock. */
1014 
1015 	if (thd != NULL) {
1016 		innobase_release_temporary_latches(ht, thd);
1017 	}
1018 
1019 	normalize_table_name(norm_name, name);
1020 
1021 	m_user_thd = NULL;
1022 
1023 	/* Get the Ha_innopart_share from the TABLE_SHARE. */
1024 	lock_shared_ha_data();
1025 	m_part_share = static_cast<Ha_innopart_share*>(get_ha_share_ptr());
1026 	if (m_part_share == NULL) {
1027 		m_part_share = new (std::nothrow)
1028 				Ha_innopart_share(table_share);
1029 		if (m_part_share == NULL) {
1030 share_error:
1031 			unlock_shared_ha_data();
1032 			DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
1033 		}
1034 		set_ha_share_ptr(static_cast<Handler_share*>(m_part_share));
1035 	}
1036 	if (m_part_share->open_table_parts(m_part_info, name)
1037 	    || m_part_share->populate_partition_name_hash(m_part_info)) {
1038 		goto share_error;
1039 	}
1040 	if (m_part_share->auto_inc_mutex == NULL
1041 	    && table->found_next_number_field != NULL) {
1042 		if (m_part_share->init_auto_inc_mutex(table_share)) {
1043 			goto share_error;
1044 		}
1045 	}
1046 	unlock_shared_ha_data();
1047 
1048 	/* Will be allocated if it is needed in ::update_row(). */
1049 	m_upd_buf = NULL;
1050 	m_upd_buf_size = 0;
1051 
1052 	/* Get pointer to a table object in InnoDB dictionary cache. */
1053 	ib_table = m_part_share->get_table_part(0);
1054 
1055 	m_pcur_parts = NULL;
1056 	m_clust_pcur_parts = NULL;
1057 	m_pcur_map = NULL;
1058 
1059 	/* TODO: Handle mismatching #P# vs #p# in upgrading to new DD instead!
1060 	See bug#58406, The problem exists when moving partitioned tables
1061 	between Windows and Unix-like platforms. InnoDB always folds the name
1062 	on windows, partitioning never folds partition (and #P# separator).
1063 	I.e. none of it follows lower_case_table_names correctly :( */
1064 
1065 	if (open_partitioning(m_part_share))
1066 	{
1067 		close();
1068 		DBUG_RETURN(HA_ERR_INITIALIZATION);
1069 	}
1070 
1071 	/* Currently we track statistics for all partitions, but for
1072 	the secondary indexes we only use the biggest partition. */
1073 
1074 	for (uint part_id = 0; part_id < m_tot_parts; part_id++) {
1075 		innobase_copy_frm_flags_from_table_share(
1076 			m_part_share->get_table_part(part_id),
1077 			table->s);
1078 		dict_stats_init(m_part_share->get_table_part(part_id));
1079 	}
1080 
1081 	MONITOR_INC(MONITOR_TABLE_OPEN);
1082 
1083 	bool	no_tablespace;
1084 
1085 	/* TODO: Should we do this check for every partition during ::open()? */
1086 	/* TODO: refactor this in ha_innobase so it can increase code reuse. */
1087 	if (dict_table_is_discarded(ib_table)) {
1088 
1089 		ib_senderrf(thd,
1090 			IB_LOG_LEVEL_WARN, ER_TABLESPACE_DISCARDED,
1091 			table->s->table_name.str);
1092 
1093 		/* Allow an open because a proper DISCARD should have set
1094 		all the flags and index root page numbers to FIL_NULL that
1095 		should prevent any DML from running but it should allow DDL
1096 		operations. */
1097 
1098 		no_tablespace = false;
1099 
1100 	} else if (ib_table->file_unreadable) {
1101 
1102 		ib_senderrf(
1103 			thd, IB_LOG_LEVEL_WARN,
1104 			ER_TABLESPACE_MISSING, norm_name);
1105 
1106 		/* This means we have no idea what happened to the tablespace
1107 		file, best to play it safe. */
1108 
1109 		no_tablespace = true;
1110 	} else {
1111 		no_tablespace = false;
1112 	}
1113 
1114 	if (!thd_tablespace_op(thd) && no_tablespace) {
1115                 set_my_errno(ENOENT);
1116 		close();
1117 		DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
1118 	}
1119 
1120 	m_prebuilt = row_create_prebuilt(ib_table, table->s->reclength);
1121 
1122 	m_prebuilt->default_rec = table->s->default_values;
1123 	ut_ad(m_prebuilt->default_rec);
1124 
1125 	assert(table != NULL);
1126 	m_prebuilt->m_mysql_table = table;
1127 	m_prebuilt->m_mysql_handler = this;
1128 
1129 	if (ib_table->n_v_cols > 0) {
1130 		mutex_enter(&dict_sys->mutex);
1131 		m_part_share->set_v_templ(table, ib_table, name);
1132 		mutex_exit(&dict_sys->mutex);
1133 	}
1134 
1135 	/* Looks like MySQL-3.23 sometimes has primary key number != 0. */
1136 	m_primary_key = table->s->primary_key;
1137 	key_used_on_scan = m_primary_key;
1138 
1139 	/* Allocate a buffer for a 'row reference'. A row reference is
1140 	a string of bytes of length ref_length which uniquely specifies
1141 	a row in our table. Note that MySQL may also compare two row
1142 	references for equality by doing a simple memcmp on the strings
1143 	of length ref_length! */
1144 
1145 	if (!row_table_got_default_clust_index(ib_table)) {
1146 
1147 		m_prebuilt->clust_index_was_generated = FALSE;
1148 
1149 		if (UNIV_UNLIKELY(m_primary_key >= MAX_KEY)) {
1150 			table_name_t table_name;
1151 			table_name.m_name = const_cast<char*>(name);
1152 			ib::error() << "Table " << table_name
1153 				<< " has a primary key in InnoDB data"
1154 				" dictionary, but not in MySQL!";
1155 
1156 			/* This mismatch could cause further problems
1157 			if not attended, bring this to the user's attention
1158 			by printing a warning in addition to log a message
1159 			in the errorlog. */
1160 
1161 			push_warning_printf(thd, Sql_condition::SL_WARNING,
1162 					    ER_NO_SUCH_INDEX,
1163 					    "Table %s has a"
1164 					    " primary key in InnoDB data"
1165 					    " dictionary, but not in"
1166 					    " MySQL!", name);
1167 
1168 			/* If m_primary_key >= MAX_KEY, its (m_primary_key)
1169 			value could be out of bounds if we continue to index
1170 			into the key_info[] array. Find the InnoDB primary index,
1171 			and assign its key_length to ref_length.
1172 			In addition, since MySQL indexes are sorted starting
1173 			with the primary index, unique indexes etc., initialize
1174 			ref_length to the first index key length in
1175 			case we fail to find the InnoDB clustered index.
1176 
1177 			Please note, this will not resolve the primary
1178 			index mismatch problem; other side effects are
1179 			possible if users continue to use the table.
1180 			However, we allow this table to be opened so
1181 			that the user can take the necessary measures to
1182 			fix the mismatch while the table data remains
1183 			accessible. */
1184 
1185 			if (table->key_info == NULL) {
1186 				ut_ad(table->s->keys == 0);
1187 				ref_length = 0;
1188 			} else {
1189 				ref_length = table->key_info[0].key_length;
1190 			}
1191 
1192 			/* Find corresponding cluster index
1193 			key length in MySQL's key_info[] array. */
1194 
1195 			for (uint i = 0; i < table->s->keys; i++) {
1196 				dict_index_t*	index;
1197 				index = innopart_get_index(0, i);
1198 				if (dict_index_is_clust(index)) {
1199 					ref_length =
1200 						 table->key_info[i].key_length;
1201 				}
1202 			}
1203 			ut_a(ref_length);
1204 			ref_length += PARTITION_BYTES_IN_POS;
1205 		} else {
1206 			/* MySQL allocates the buffer for ref.
1207 			key_info->key_length includes space for all key
1208 			columns + one byte for each column that may be
1209 			NULL. ref_length must be as exact as possible to
1210 			save space, because all row reference buffers are
1211 			allocated based on ref_length. */
1212 
1213 			ref_length = table->key_info[m_primary_key].key_length;
1214 			ref_length += PARTITION_BYTES_IN_POS;
1215 		}
1216 	} else {
1217 		if (m_primary_key != MAX_KEY) {
1218 			table_name_t table_name;
1219 			table_name.m_name = const_cast<char*>(name);
1220 			ib::error() << "Table " << table_name
1221 				<< " has no primary key in InnoDB data"
1222 				" dictionary, but has one in MySQL! If you"
1223 				" created the table with a MySQL version <"
1224 				" 3.23.54 and did not define a primary key,"
1225 				" but defined a unique key with all non-NULL"
1226 				" columns, then MySQL internally treats that"
1227 				" key as the primary key. You can fix this"
1228 				" error by dump + DROP + CREATE + reimport"
1229 				" of the table.";
1230 
1231 			/* This mismatch could cause further problems
1232 			if not attended, bring this to the user's attention
1233 			by printing a warning in addition to log a message
1234 			in the errorlog. */
1235 
1236 			push_warning_printf(thd, Sql_condition::SL_WARNING,
1237 					    ER_NO_SUCH_INDEX,
1238 					    "InnoDB: Table %s has no"
1239 					    " primary key in InnoDB data"
1240 					    " dictionary, but has one in"
1241 					    " MySQL!", name);
1242 		}
1243 
1244 		m_prebuilt->clust_index_was_generated = TRUE;
1245 
1246 		ref_length = DATA_ROW_ID_LEN;
1247 		ref_length += PARTITION_BYTES_IN_POS;
1248 
1249 		/* If we automatically created the clustered index, then
1250 		MySQL does not know about it, and MySQL must NOT be aware
1251 		of the index used on scan, to make it avoid checking if we
1252 		update the column of the index. That is why we assert below
1253 		that key_used_on_scan is the undefined value MAX_KEY.
1254 		The column is the row id in the automatic generation case,
1255 		and it will never be updated anyway. */
1256 
1257 		if (key_used_on_scan != MAX_KEY) {
1258 			table_name_t table_name;
1259 			table_name.m_name = const_cast<char*>(name);
1260 			ib::warn() << "Table " << table_name
1261 				<< " key_used_on_scan is "
1262 				<< key_used_on_scan << " even though there is"
1263 				" no primary key inside InnoDB.";
1264 		}
1265 	}
1266 
1267 	/* Index block size in InnoDB: used by MySQL in query optimization. */
1268 	stats.block_size = UNIV_PAGE_SIZE;
1269 
1270 	if (m_prebuilt->table != NULL) {
1271 		/* We update the highest file format in the system table
1272 		space, if this table has higher file format setting. */
1273 
1274 		trx_sys_file_format_max_upgrade(
1275 			(const char**) &innobase_file_format_max,
1276 			dict_table_get_format(m_prebuilt->table));
1277 	}
1278 
1279 	/* Only if the table has an AUTOINC column. */
1280 	if (m_prebuilt->table != NULL
1281 	    && !m_prebuilt->table->file_unreadable
1282 	    && table->found_next_number_field != NULL) {
1283 		int	error;
1284 
1285 		/* Since a table can already be "open" in InnoDB's internal
1286 		data dictionary, we only init the autoinc counter once, the
1287 		first time the table is loaded,
1288 		see ha_innopart::initialize_auto_increment.
1289 		We can safely reuse the autoinc value from a previous MySQL
1290 		open. */
1291 
1292 		lock_auto_increment();
1293 		error = initialize_auto_increment(false);
1294 		unlock_auto_increment();
1295 		if (error != 0) {
1296 			close();
1297 			DBUG_RETURN(error);
1298 		}
1299 	}
1300 
1301 #ifdef HA_INNOPART_SUPPORTS_FULLTEXT
1302 	/* Set plugin parser for fulltext index. */
1303 	for (uint i = 0; i < table->s->keys; i++) {
1304 		if (table->key_info[i].flags & HA_USES_PARSER) {
1305 			dict_index_t*	index = innobase_get_index(i);
1306 			plugin_ref	parser = table->key_info[i].parser;
1307 
1308 			ut_ad(index->type & DICT_FTS);
1309 			index->parser =
1310 				static_cast<st_mysql_ftparser *>(
1311 					plugin_decl(parser)->info);
1312 
1313 			DBUG_EXECUTE_IF("fts_instrument_use_default_parser",
1314 				index->parser = &fts_default_parser;);
1315 		}
1316 	}
1317 #endif /* HA_INNOPART_SUPPORTS_FULLTEXT */
1318 
1319 	size_t	alloc_size = sizeof(*m_ins_node_parts) * m_tot_parts;
1320 	m_ins_node_parts = static_cast<ins_node_t**>(
1321 		ut_zalloc(alloc_size, mem_key_partitioning));
1322 
1323 	alloc_size = sizeof(*m_upd_node_parts) * m_tot_parts;
1324 	m_upd_node_parts = static_cast<upd_node_t**>(
1325 		ut_zalloc(alloc_size, mem_key_partitioning));
1326 
1327 	alloc_blob_heap_array();
1328 
1329 	alloc_size = sizeof(*m_trx_id_parts) * m_tot_parts;
1330 	m_trx_id_parts = static_cast<trx_id_t*>(
1331 		ut_zalloc(alloc_size, mem_key_partitioning));
1332 
1333 	alloc_size = sizeof(*m_row_read_type_parts) * m_tot_parts;
1334 	m_row_read_type_parts = static_cast<ulint*>(
1335 		ut_zalloc(alloc_size, mem_key_partitioning));
1336 
1337 	alloc_size = UT_BITS_IN_BYTES(m_tot_parts);
1338 	m_sql_stat_start_parts = static_cast<uchar*>(
1339 		ut_zalloc(alloc_size, mem_key_partitioning));
1340 	if (m_ins_node_parts == NULL
1341 	    || m_upd_node_parts == NULL
1342 	    || m_blob_heap_parts == NULL
1343 	    || m_trx_id_parts == NULL
1344 	    || m_row_read_type_parts == NULL
1345 	    || m_sql_stat_start_parts == NULL) {
1346 		close();  // Frees all the above.
1347 		DBUG_RETURN(HA_ERR_OUT_OF_MEM);
1348 	}
1349 	info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
1350 
1351 	DBUG_RETURN(0);
1352 }
1353 
1354 /** Get a cloned ha_innopart handler.
1355 @param[in]	name		Table name.
1356 @param[in]	mem_root	MySQL mem_root to use.
1357 @return	new ha_innopart handler. */
1358 handler*
1359 ha_innopart::clone(
1360 	const char*	name,
1361 	MEM_ROOT*	mem_root)
1362 {
1363 	ha_innopart*	new_handler;
1364 
1365 	DBUG_ENTER("ha_innopart::clone");
1366 
1367 	new_handler = dynamic_cast<ha_innopart*>(handler::clone(name,
1368 							mem_root));
1369 	if (new_handler != NULL) {
1370 		ut_ad(new_handler->m_prebuilt != NULL);
1371 
1372 		new_handler->m_prebuilt->select_lock_type =
1373 			m_prebuilt->select_lock_type;
1374 	}
1375 
1376 	DBUG_RETURN(new_handler);
1377 }
1378 
1379 /** Clear used ins_nodes and upd_nodes. */
1380 void ha_innopart::clear_ins_upd_nodes()
1381 {
1382 	/* Free memory from insert nodes. */
1383 	if (m_ins_node_parts != NULL) {
1384 		for (uint i = 0; i < m_tot_parts; i++) {
1385 			if (m_ins_node_parts[i] != NULL) {
1386 				ins_node_t*	ins = m_ins_node_parts[i];
1387 				if (ins->select != NULL) {
1388 					que_graph_free_recursive(ins->select);
1389 					ins->select = NULL;
1390 				}
1391 
1392 				if (ins->entry_sys_heap != NULL) {
1393 					mem_heap_free(ins->entry_sys_heap);
1394 					ins->entry_sys_heap = NULL;
1395 				}
1396 				m_ins_node_parts[i] = NULL;
1397 			}
1398 		}
1399 	}
1400 
1401 	/* Free memory from update nodes. */
1402 	if (m_upd_node_parts != NULL) {
1403 		for (uint i = 0; i < m_tot_parts; i++) {
1404 			if (m_upd_node_parts[i] != NULL) {
1405 				upd_node_t*	upd = m_upd_node_parts[i];
1406 				if (upd->cascade_heap) {
1407 					mem_heap_free(upd->cascade_heap);
1408 					upd->cascade_heap = NULL;
1409 				}
1410 				if (upd->in_mysql_interface) {
1411 					btr_pcur_free_for_mysql(upd->pcur);
1412 					upd->in_mysql_interface = FALSE;
1413 				}
1414 
1415 				if (upd->select != NULL) {
1416 					que_graph_free_recursive(upd->select);
1417 					upd->select = NULL;
1418 				}
1419 				if (upd->heap != NULL) {
1420 					mem_heap_free(upd->heap);
1421 					upd->heap = NULL;
1422 				}
1423 				m_upd_node_parts[i] = NULL;
1424 			}
1425 		}
1426 	}
1427 }
1428 
1429 /** Closes a handle to an InnoDB table.
1430 @return	0 */
1431 int
1432 ha_innopart::close()
1433 {
1434 	THD*	thd;
1435 
1436 	DBUG_ENTER("ha_innopart::close");
1437 
1438 	thd = ha_thd();
1439 	if (thd != NULL) {
1440 		innobase_release_temporary_latches(ht, thd);
1441 	}
1442 
1443 	ut_ad(m_pcur_parts == NULL);
1444 	ut_ad(m_clust_pcur_parts == NULL);
1445 	close_partitioning();
1446 
1447 	ut_ad(m_part_share != NULL);
1448 	if (m_part_share != NULL) {
1449 		lock_shared_ha_data();
1450 		m_part_share->close_table_parts();
1451 		unlock_shared_ha_data();
1452 		m_part_share = NULL;
1453 	}
1454 	clear_ins_upd_nodes();
1455 	free_blob_heap_array();
1456 
1457 	/* Prevent double close of m_prebuilt->table. The real one was
1458 	done in m_part_share->close_table_parts(). */
1459 	if (m_prebuilt != NULL) {
1460 		m_prebuilt->table = NULL;
1461 		row_prebuilt_free(m_prebuilt, FALSE);
1462         }
1463 
1464 	if (m_upd_buf != NULL) {
1465 		ut_ad(m_upd_buf_size != 0);
1466 		/* Allocated with my_malloc! */
1467 		my_free(m_upd_buf);
1468 		m_upd_buf = NULL;
1469 		m_upd_buf_size = 0;
1470 	}
1471 
1472 	if (m_ins_node_parts != NULL) {
1473 		ut_free(m_ins_node_parts);
1474 		m_ins_node_parts = NULL;
1475 	}
1476 	if (m_upd_node_parts != NULL) {
1477 		ut_free(m_upd_node_parts);
1478 		m_upd_node_parts = NULL;
1479 	}
1480 	if (m_trx_id_parts != NULL) {
1481 		ut_free(m_trx_id_parts);
1482 		m_trx_id_parts = NULL;
1483 	}
1484 	if (m_row_read_type_parts != NULL) {
1485 		ut_free(m_row_read_type_parts);
1486 		m_row_read_type_parts = NULL;
1487 	}
1488 	if (m_sql_stat_start_parts != NULL) {
1489 		ut_free(m_sql_stat_start_parts);
1490 		m_sql_stat_start_parts = NULL;
1491 	}
1492 
1493 	MONITOR_INC(MONITOR_TABLE_CLOSE);
1494 
1495 	/* Tell InnoDB server that there might be work for
1496 	utility threads: */
1497 
1498 	srv_active_wake_master_thread();
1499 
1500 	DBUG_RETURN(0);
1501 }
1502 
1503 /** Change active partition.
1504 Copies needed info into m_prebuilt from the partition specific memory.
1505 @param[in]	part_id	Partition to set as active. */
1506 void
1507 ha_innopart::set_partition(
1508 	uint	part_id)
1509 {
1510 	DBUG_ENTER("ha_innopart::set_partition");
1511 
1512 	DBUG_PRINT("ha_innopart", ("partition id: %u", part_id));
1513 
1514 	if (part_id >= m_tot_parts) {
1515 		ut_ad(0);
1516 		DBUG_VOID_RETURN;
1517 	}
1518 	if (m_pcur_parts != NULL) {
1519 		m_prebuilt->pcur = &m_pcur_parts[m_pcur_map[part_id]];
1520 	}
1521 	if (m_clust_pcur_parts != NULL) {
1522 		m_prebuilt->clust_pcur =
1523 			&m_clust_pcur_parts[m_pcur_map[part_id]];
1524 	}
1525 	m_prebuilt->ins_node = m_ins_node_parts[part_id];
1526 	m_prebuilt->upd_node = m_upd_node_parts[part_id];
1527 
1528 	/* For unordered scans and table scans, use the blob_heap from the
1529 	first partition, as we need exactly one blob heap. */
1530 	m_prebuilt->blob_heap = m_blob_heap_parts[m_ordered ? part_id : 0];
1531 
1532 #ifdef UNIV_DEBUG
1533 	if (m_prebuilt->blob_heap != NULL) {
1534 		DBUG_PRINT("ha_innopart", ("validating blob_heap: %p",
1535 					   m_prebuilt->blob_heap));
1536 		mem_heap_validate(m_prebuilt->blob_heap);
1537 	}
1538 #endif
1539 
1540 	m_prebuilt->trx_id = m_trx_id_parts[part_id];
1541 	m_prebuilt->row_read_type = m_row_read_type_parts[part_id];
1542 	m_prebuilt->sql_stat_start = get_bit(m_sql_stat_start_parts, part_id);
1543 	m_prebuilt->table = m_part_share->get_table_part(part_id);
1544 	m_prebuilt->index = innopart_get_index(part_id, active_index);
1545 
1546 	DBUG_VOID_RETURN;
1547 }
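
/* Pattern note (added comment): set_partition() above and
update_partition() below form a save/restore pair around each per-partition
operation. A caller typically does set_partition(part_id), performs the
ha_innobase call, and then update_partition(part_id) to copy the modified
state back, e.g. as in write_row_in_part() further down in this file. */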
1548 
1549 /** Update active partition.
1550 Copies needed info from m_prebuilt into the partition specific memory.
1551 @param[in]	part_id	Partition to set as active. */
1552 void
1553 ha_innopart::update_partition(
1554 	uint	part_id)
1555 {
1556 	DBUG_ENTER("ha_innopart::update_partition");
1557 	DBUG_PRINT("ha_innopart", ("partition id: %u", part_id));
1558 
1559 	if (part_id >= m_tot_parts) {
1560 		ut_ad(0);
1561 		DBUG_VOID_RETURN;
1562 	}
1563 	m_ins_node_parts[part_id] = m_prebuilt->ins_node;
1564 	m_upd_node_parts[part_id] = m_prebuilt->upd_node;
1565 
1566 #ifdef UNIV_DEBUG
1567 	if (m_prebuilt->blob_heap != NULL) {
1568 		DBUG_PRINT("ha_innopart", ("validating blob_heap: %p",
1569 					   m_prebuilt->blob_heap));
1570 		mem_heap_validate(m_prebuilt->blob_heap);
1571 	}
1572 #endif
1573 
1574 	/* For unordered scans and table scans, use the blob_heap from the
1575 	first partition, as we need exactly one blob heap at any time. */
1576 	m_blob_heap_parts[m_ordered ? part_id : 0] = m_prebuilt->blob_heap;
1577 
1578 	m_trx_id_parts[part_id] = m_prebuilt->trx_id;
1579 	m_row_read_type_parts[part_id] = m_prebuilt->row_read_type;
1580 	if (m_prebuilt->sql_stat_start == 0) {
1581 		clear_bit(m_sql_stat_start_parts, part_id);
1582 	}
1583 	m_last_part = part_id;
1584 	DBUG_VOID_RETURN;
1585 }
1586 
1587 /** Save currently highest auto increment value.
1588 @param[in]	nr	Auto increment value to save. */
1589 void
1590 ha_innopart::save_auto_increment(
1591 	ulonglong	nr)
1592 {
1593 
1594 	/* Store it in the shared dictionary of the partition.
1595 	TODO: When the new DD is done, store it in the table and make it
1596 	persistent! */
1597 
1598 	dict_table_autoinc_lock(m_prebuilt->table);
1599 	dict_table_autoinc_update_if_greater(m_prebuilt->table, nr + 1);
1600 	dict_table_autoinc_unlock(m_prebuilt->table);
1601 }
1602 
1603 /** Was the last returned row semi consistent read.
1604 In an UPDATE or DELETE, if the row under the cursor was locked by
1605 another transaction, and the engine used an optimistic read of the last
1606 committed row value under the cursor, then the engine returns 1 from
1607 this function. MySQL must NOT try to update this optimistic value. If
1608 the optimistic value does not match the WHERE condition, MySQL can
1609 decide to skip over this row. This can be used to avoid unnecessary
1610 lock waits.
1611 
1612 If this method returns true, it will also signal the storage
1613 engine that the next read will be a locking re-read of the row.
1614 @see handler.h and row0mysql.h
1615 @return	true if last read was semi consistent else false. */
1616 bool
1617 ha_innopart::was_semi_consistent_read()
1618 {
1619 	return(m_row_read_type_parts[m_last_part]
1620 		== ROW_READ_DID_SEMI_CONSISTENT);
1621 }
1622 
1623 /** Try semi consistent read.
1624 Tell the engine whether it should avoid unnecessary lock waits.
1625 If yes, in an UPDATE or DELETE, if the row under the cursor was locked
1626 by another transaction, the engine may try an optimistic read of
1627 the last committed row value under the cursor.
1628 @see handler.h and row0mysql.h
1629 @param[in]	yes	Should semi-consistent read be used. */
1630 void
1631 ha_innopart::try_semi_consistent_read(
1632 	bool	yes)
1633 {
1634 	ha_innobase::try_semi_consistent_read(yes);
1635 	for (uint i = m_part_info->get_first_used_partition();
1636 	     i < m_tot_parts;
1637 	     i = m_part_info->get_next_used_partition(i)) {
1638 
1639 		m_row_read_type_parts[i] = m_prebuilt->row_read_type;
1640 	}
1641 }
1642 
1643 /** Removes a lock on a row.
1644 Removes a new lock set on a row, if it was not read optimistically.
1645 This can be called after a row has been read in the processing of
1646 an UPDATE or a DELETE query. @see ha_innobase::unlock_row(). */
1647 void
1648 ha_innopart::unlock_row()
1649 {
1650 	ut_ad(m_last_part < m_tot_parts);
1651 	set_partition(m_last_part);
1652 	ha_innobase::unlock_row();
1653 	update_partition(m_last_part);
1654 }
1655 
1656 /** Write a row in partition.
1657 Stores a row in an InnoDB database, to the table specified in this
1658 handle.
1659 @param[in]	part_id	Partition to write to.
1660 @param[in]	record	A row in MySQL format.
1661 @return	0 or error code. */
1662 int
1663 ha_innopart::write_row_in_part(
1664 	uint	part_id,
1665 	uchar*	record)
1666 {
1667 	int	error;
1668 	Field*	saved_next_number_field = table->next_number_field;
1669 	DBUG_ENTER("ha_innopart::write_row_in_part");
1670 	set_partition(part_id);
1671 
1672 	/* Prevent update_auto_increment to be called
1673 	again in ha_innobase::write_row(). */
1674 
1675 	table->next_number_field = NULL;
1676 
1677 	/* TODO: try to avoid creating a new dtuple
1678 	(in row_get_prebuilt_insert_row()) for each partition).
1679 	Might be needed due to ins_node implementation. */
1680 
1681 	error = ha_innobase::write_row(record);
1682 	update_partition(part_id);
1683 	table->next_number_field = saved_next_number_field;
1684 	DBUG_RETURN(error);
1685 }
1686 
1687 /** Update a row in partition.
1688 Updates a row given as a parameter to a new value.
1689 @param[in]	part_id	Partition to update row in.
1690 @param[in]	old_row	Old row in MySQL format.
1691 @param[in]	new_row	New row in MySQL format.
1692 @return	0 or error number. */
1693 int
1694 ha_innopart::update_row_in_part(
1695 	uint		part_id,
1696 	const uchar*	old_row,
1697 	uchar*		new_row)
1698 {
1699 	int	     error;
1700 	DBUG_ENTER("ha_innopart::update_row_in_part");
1701 
1702 	set_partition(part_id);
1703 	error = ha_innobase::update_row(old_row, new_row);
1704 	update_partition(part_id);
1705 	DBUG_RETURN(error);
1706 }
1707 
1708 /** Deletes a row in partition.
1709 @param[in]	part_id	Partition to delete from.
1710 @param[in]	record	Row to delete in MySQL format.
1711 @return	0 or error number. */
1712 int
1713 ha_innopart::delete_row_in_part(
1714 	uint		part_id,
1715 	const uchar*	record)
1716 {
1717 	int	error;
1718 	DBUG_ENTER("ha_innopart::delete_row_in_part");
1719 	m_err_rec = NULL;
1720 
1721 	m_last_part = part_id;
1722 	set_partition(part_id);
1723 	error = ha_innobase::delete_row(record);
1724 	update_partition(part_id);
1725 	DBUG_RETURN(error);
1726 }
1727 
1728 /** Initializes a handle to use an index.
1729 @param[in]	keynr	Key (index) number.
1730 @param[in]	sorted	True if result MUST be sorted according to index.
1731 @return	0 or error number. */
1732 int
1733 ha_innopart::index_init(
1734 	uint	keynr,
1735 	bool	sorted)
1736 {
1737 	int	error;
1738 	uint	part_id = m_part_info->get_first_used_partition();
1739 	DBUG_ENTER("ha_innopart::index_init");
1740 
1741 	active_index = keynr;
1742 	if (part_id == MY_BIT_NONE) {
1743 		DBUG_RETURN(0);
1744 	}
1745 
1746 	error = ph_index_init_setup(keynr, sorted);
1747 	if (error != 0) {
1748 		DBUG_RETURN(error);
1749 	}
1750 
1751 	if (sorted) {
1752 		error = init_record_priority_queue();
1753 		if (error != 0) {
1754 			/* Needs cleanup in case it returns an error. */
1755 			destroy_record_priority_queue();
1756 			DBUG_RETURN(error);
1757 		}
1758 		/* Disable prefetch.
1759 		The prefetch buffer is not partitioning aware, so it may return
1760 		rows from a different partition if either the prefetch buffer is
1761 		full, or it is non-empty and the partition is exhausted. */
1762 		m_prebuilt->m_no_prefetch = true;
1763 	}
1764 
1765 	/* For a scan across partitions, the keys need to be materialized. */
1766 	m_prebuilt->m_read_virtual_key = true;
1767 
1768 	error = change_active_index(part_id, keynr);
1769 	if (error != 0) {
1770 		destroy_record_priority_queue();
1771 		DBUG_RETURN(error);
1772 	}
1773 
1774 	DBUG_EXECUTE_IF("partition_fail_index_init", {
1775 		destroy_record_priority_queue();
1776 		DBUG_RETURN(HA_ERR_NO_PARTITION_FOUND);
1777 	});
1778 
1779 	DBUG_RETURN(0);
1780 }
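
/* Editorial note: when 'sorted' is true, the ordered-scan machinery set up in
index_init() merges rows from all used partitions through the record priority
queue. Illustrative example only: if partition p0 returns keys (1, 7) and
partition p1 returns keys (3, 4), the queue pops 1, 3, 4, 7, so the scan looks
globally ordered even though each partition is read independently. */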
1781 
1782 /** End index cursor.
1783 @return	0 or error code. */
1784 int
1785 ha_innopart::index_end()
1786 {
1787 	uint	part_id = m_part_info->get_first_used_partition();
1788 	DBUG_ENTER("ha_innopart::index_end");
1789 
1790 	if (part_id == MY_BIT_NONE) {
1791 		/* Never initialized any index. */
1792 		active_index = MAX_KEY;
1793 		DBUG_RETURN(0);
1794 	}
1795 	if (m_ordered) {
1796 		destroy_record_priority_queue();
1797 		m_prebuilt->m_no_prefetch = false;
1798 	}
1799 	m_prebuilt->m_read_virtual_key = false;
1800 
1801 	DBUG_RETURN(ha_innobase::index_end());
1802 }
1803 
1804 /* Partitioning support functions. */
1805 
1806 /** Setup the ordered record buffer and the priority queue.
1807 @param[in]	used_parts	Number of used partitions in query.
1808 @return	false for success, true on failure. */
1809 int
1810 ha_innopart::init_record_priority_queue_for_parts(
1811 	uint	used_parts)
1812 {
1813 	size_t	alloc_size;
1814 	void*	buf;
1815 
1816 	DBUG_ENTER("ha_innopart::init_record_priority_queue_for_parts");
1817 	ut_ad(used_parts >= 1);
1818 	/* TODO: Don't use this if only one partition is used! */
1819 	//ut_ad(used_parts > 1);
1820 
1821 	/* We could reuse current m_prebuilt->pcur/clust_pcur for the first
1822 	used partition, but it would complicate and affect performance,
1823 	so we trade some extra memory instead. */
1824 
1825 	m_pcur = m_prebuilt->pcur;
1826 	m_clust_pcur = m_prebuilt->clust_pcur;
1827 
1828 	/* If we are searching via a secondary key, or doing a write/update,
1829 	we will need two pcurs: one for the active (secondary) index and
1830 	one for the clustered index. */
1831 
1832 	bool	need_clust_index =
1833 			m_curr_key_info[1] != NULL
1834 			|| get_lock_type() != F_RDLCK;
1835 
1836 	/* pcur and clust_pcur per partition.
1837 	By using zalloc, we do not need to initialize the pcur's! */
1838 
1839 	alloc_size = used_parts * sizeof(btr_pcur_t);
1840 	if (need_clust_index) {
1841 		alloc_size *= 2;
1842 	}
1843 	buf = ut_zalloc(alloc_size, mem_key_partitioning);
1844 	if (buf == NULL) {
1845 		DBUG_RETURN(true);
1846 	}
1847 	m_pcur_parts = static_cast<btr_pcur_t*>(buf);
1848 	if (need_clust_index) {
1849 		m_clust_pcur_parts = &m_pcur_parts[used_parts];
1850 	}
1851 	/* mapping from part_id to pcur. */
1852 	alloc_size = m_tot_parts * sizeof(*m_pcur_map);
1853 	buf = ut_zalloc(alloc_size, mem_key_partitioning);
1854 	if (buf == NULL) {
1855 		DBUG_RETURN(true);
1856 	}
1857 	m_pcur_map = static_cast<uint16_t*>(buf);
1858 	{
1859 		uint16_t pcur_count = 0;
1860 		for (uint i = m_part_info->get_first_used_partition();
1861 		     i < m_tot_parts;
1862 		     i = m_part_info->get_next_used_partition(i)) {
1863 			m_pcur_map[i] = pcur_count++;
1864 		}
1865 	}
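	/* Editorial example (hedged): the map above is sparse over all
	partitions but dense over the used ones. With m_tot_parts == 4 and
	read_partitions == {1, 3}, the loop assigns m_pcur_map[1] = 0 and
	m_pcur_map[3] = 1; entries 0 and 2 keep their zero-initialized values
	and must never be used to index m_pcur_parts. */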
1866 
1867 	DBUG_RETURN(false);
1868 }
1869 
1870 /** Destroy the ordered record buffer and the priority queue. */
1871 inline
1872 void
1873 ha_innopart::destroy_record_priority_queue_for_parts()
1874 {
1875 	DBUG_ENTER("ha_innopart::destroy_record_priority_queue_for_parts");
1876 	if (m_pcur_parts != NULL) {
1877 		uint	used_parts;
1878 		used_parts = bitmap_bits_set(&m_part_info->read_partitions);
1879 		for (uint i = 0; i < used_parts; i++) {
1880 			btr_pcur_free(&m_pcur_parts[i]);
1881 			if (m_clust_pcur_parts != NULL) {
1882 				btr_pcur_free(&m_clust_pcur_parts[i]);
1883 			}
1884 		}
1885 		ut_free(m_pcur_parts);
1886 		m_clust_pcur_parts = NULL;
1887 		m_pcur_parts = NULL;
1888 		/* Reset the original m_prebuilt->pcur. */
1889 		m_prebuilt->pcur = m_pcur;
1890 		m_prebuilt->clust_pcur = m_clust_pcur;
1891 	}
1892 	if (m_pcur_map != NULL) {
1893 		ut_free(m_pcur_map);
1894 		m_pcur_map = NULL;
1895 	}
1896 	DBUG_VOID_RETURN;
1897 }
1898 
1899 /** Print error information.
1900 @param[in]	error	Error code (MySQL).
1901 @param[in]	errflag	Flags. */
1902 void
1903 ha_innopart::print_error(
1904 	int	error,
1905 	myf	errflag)
1906 {
1907 	DBUG_ENTER("ha_innopart::print_error");
1908 	if (print_partition_error(error, errflag)) {
1909 		ha_innobase::print_error(error, errflag);
1910 	}
1911 
1912 	DBUG_VOID_RETURN;
1913 }
1914 
1915 /** Can error be ignored.
1916 @param[in]	error	Error code to check.
1917 @return	true if ignorable else false. */
1918 bool
1919 ha_innopart::is_ignorable_error(
1920 	int	error)
1921 {
1922 	if (ha_innobase::is_ignorable_error(error)
1923 	    || error == HA_ERR_NO_PARTITION_FOUND
1924 	    || error == HA_ERR_NOT_IN_LOCK_PARTITIONS) {
1925 
1926 		return(true);
1927 	}
1928 	return(false);
1929 }
1930 
1931 /** Get the index for the current partition.
1932 @param[in]	keynr	MySQL index number.
1933 @return	InnoDB index or NULL. */
1934 inline
1935 dict_index_t*
1936 ha_innopart::innobase_get_index(
1937 	uint	keynr)
1938 {
1939 	uint	part_id = m_last_part;
1940 	if (part_id >= m_tot_parts) {
1941 		ut_ad(0);
1942 		part_id = 0;
1943 	}
1944 	return(innopart_get_index(part_id, keynr));
1945 }
1946 
1947 /** Get the index for a handle.
1948 Does not change active index.
1949 @param[in]	keynr	Use this index; MAX_KEY means always clustered index,
1950 even if it was internally generated by InnoDB.
1951 @param[in]	part_id	From this partition.
1952 @return	NULL or index instance. */
1953 inline
1954 dict_index_t*
1955 ha_innopart::innopart_get_index(
1956 	uint	part_id,
1957 	uint	keynr)
1958 {
1959 	KEY*		key = NULL;
1960 	dict_index_t*	index = NULL;
1961 
1962 	DBUG_ENTER("innopart_get_index");
1963 
1964 	if (keynr != MAX_KEY && table->s->keys > 0) {
1965 		key = table->key_info + keynr;
1966 
1967 		index = m_part_share->get_index(part_id, keynr);
1968 
1969 		if (index != NULL) {
1970 			ut_a(ut_strcmp(index->name, key->name) == 0);
1971 		} else {
1972 			/* Can't find index with keynr in the translation
1973 			table. Only print message if the index translation
1974 			table exists. */
1975 
1976 			ib::warn() << "InnoDB could not find index "
1977 				<< (key ? key->name : "NULL")
1978 				<< " key no " << keynr << " for table "
1979 				<< m_prebuilt->table->name
1980 				<< " through its index translation table";
1981 
1982 			index = dict_table_get_index_on_name(m_prebuilt->table,
1983 							     key->name);
1984 		}
1985 	} else {
1986 		/* Get the generated index. */
1987 		ut_ad(keynr == MAX_KEY);
1988 		index = dict_table_get_first_index(
1989 				m_part_share->get_table_part(part_id));
1990 	}
1991 
1992 	if (index == NULL) {
1993 		ib::error() << "InnoDB could not find key n:o "
1994 			<< keynr << " with name " << (key ? key->name : "NULL")
1995 			<< " from dict cache for table "
1996 			<< m_prebuilt->table->name << " partition n:o "
1997 			<< part_id;
1998 	}
1999 
2000 	DBUG_RETURN(index);
2001 }
2002 
2003 /** Changes the active index of a handle.
2004 @param[in]	part_id	Use this partition.
2005 @param[in]	keynr	Use this index; MAX_KEY means always clustered index,
2006 even if it was internally generated by InnoDB.
2007 @return	0 or error number. */
2008 int
2009 ha_innopart::change_active_index(
2010 	uint	part_id,
2011 	uint	keynr)
2012 {
2013 	DBUG_ENTER("ha_innopart::change_active_index");
2014 
2015 	ut_ad(m_user_thd == ha_thd());
2016 	ut_a(m_prebuilt->trx == thd_to_trx(m_user_thd));
2017 
2018 	active_index = keynr;
2019 	set_partition(part_id);
2020 
2021 	if (UNIV_UNLIKELY(m_prebuilt->index == NULL)) {
2022 		ib::warn() << "change_active_index(" << part_id
2023 			<< "," << keynr << ") failed";
2024 		m_prebuilt->index_usable = FALSE;
2025 		DBUG_RETURN(1);
2026 	}
2027 
2028 	m_prebuilt->index_usable = row_merge_is_index_usable(m_prebuilt->trx,
2029 							   m_prebuilt->index);
2030 
2031 	if (UNIV_UNLIKELY(!m_prebuilt->index_usable)) {
2032 		if (dict_index_is_corrupted(m_prebuilt->index)) {
2033 			char table_name[MAX_FULL_NAME_LEN + 1];
2034 
2035 			innobase_format_name(
2036 				table_name, sizeof table_name,
2037 				m_prebuilt->index->table->name.m_name);
2038 
2039 			push_warning_printf(
2040 				m_user_thd, Sql_condition::SL_WARNING,
2041 				HA_ERR_INDEX_CORRUPT,
2042 				"InnoDB: Index %s for table %s is"
2043 				" marked as corrupted"
2044 				" (partition %u)",
2045 				m_prebuilt->index->name(), table_name, part_id);
2046 			DBUG_RETURN(HA_ERR_INDEX_CORRUPT);
2047 		} else {
2048 			push_warning_printf(
2049 				m_user_thd, Sql_condition::SL_WARNING,
2050 				HA_ERR_TABLE_DEF_CHANGED,
2051 				"InnoDB: insufficient history for index %u",
2052 				keynr);
2053 		}
2054 
2055 		/* The caller seems to ignore this. Thus, we must check
2056 		this again in row_search_for_mysql(). */
2057 
2058 		DBUG_RETURN(HA_ERR_TABLE_DEF_CHANGED);
2059 	}
2060 
2061 	ut_a(m_prebuilt->search_tuple != NULL);
2062 
2063 	/* If too expensive, cache the keynr and only update search_tuple when
2064 	keynr changes. Remember that the clustered index is also used for
2065 	MAX_KEY. */
2066 	dtuple_set_n_fields(m_prebuilt->search_tuple,
2067 		m_prebuilt->index->n_fields);
2068 
2069 	dict_index_copy_types(m_prebuilt->search_tuple, m_prebuilt->index,
2070 			m_prebuilt->index->n_fields);
2071 
2072 	/* MySQL changes the active index for a handle also during some
2073 	queries, for example SELECT MAX(a), SUM(a) first retrieves the
2074 	MAX() and then calculates the sum. Previously we played safe
2075 	and used the flag ROW_MYSQL_WHOLE_ROW below, but that caused
2076 	unnecessary copying. Starting from MySQL-4.1 we use a more
2077 	efficient flag here. */
2078 
2079 	/* TODO: Is this really needed?
2080 	Will it not be built in index_read? */
2081 
2082 	build_template(false);
2083 
2084 	DBUG_RETURN(0);
2085 }
2086 
2087 /** Return first record in index from a partition.
2088 @param[in]	part	Partition to read from.
2089 @param[out]	record	First record in index in the partition.
2090 @return	error number or 0. */
2091 int
2092 ha_innopart::index_first_in_part(
2093 	uint	part,
2094 	uchar*	record)
2095 {
2096 	int	error;
2097 	DBUG_ENTER("ha_innopart::index_first_in_part");
2098 
2099 	set_partition(part);
2100 	error = ha_innobase::index_first(record);
2101 	update_partition(part);
2102 
2103 	DBUG_RETURN(error);
2104 }
2105 
2106 /** Return next record in index from a partition.
2107 @param[in]	part	Partition to read from.
2108 @param[out]	record	Next record in index in the partition.
2109 @return	error number or 0. */
2110 int
2111 ha_innopart::index_next_in_part(
2112 	uint	part,
2113 	uchar*	record)
2114 {
2115 	DBUG_ENTER("ha_innopart::index_next_in_part");
2116 
2117 	int	error;
2118 
2119 	set_partition(part);
2120 	error = ha_innobase::index_next(record);
2121 	update_partition(part);
2122 
2123 	ut_ad(m_ordered_scan_ongoing
2124 	      || m_ordered_rec_buffer == NULL
2125 	      || m_prebuilt->used_in_HANDLER
2126 	      || m_part_spec.start_part >= m_part_spec.end_part);
2127 
2128 	DBUG_RETURN(error);
2129 }
2130 
2131 /** Return next same record in index from a partition.
2132 This routine is used to read the next record, but only if the key is
2133 the same as supplied in the call.
2134 @param[in]	part	Partition to read from.
2135 @param[out]	record	Next matching record in index in the partition.
2136 @param[in]	key	Key to match.
2137 @param[in]	length	Length of key.
2138 @return	error number or 0. */
2139 int
2140 ha_innopart::index_next_same_in_part(
2141 	uint		part,
2142 	uchar*		record,
2143 	const uchar*	key,
2144 	uint		length)
2145 {
2146 	int	error;
2147 
2148 	set_partition(part);
2149 	error = ha_innobase::index_next_same(record, key, length);
2150 	update_partition(part);
2151 	return(error);
2152 }
2153 
2154 /** Return last record in index from a partition.
2155 @param[in]	part	Partition to read from.
2156 @param[out]	record	Last record in index in the partition.
2157 @return	error number or 0. */
2158 int
2159 ha_innopart::index_last_in_part(
2160 	uint	part,
2161 	uchar*	record)
2162 {
2163 	int	error;
2164 
2165 	set_partition(part);
2166 	error = ha_innobase::index_last(record);
2167 	update_partition(part);
2168 	return(error);
2169 }
2170 
2171 /** Return previous record in index from a partition.
2172 @param[in]	part	Partition to read from.
2173 @param[out]	record	Previous record in index in the partition.
2174 @return	error number or 0. */
2175 int
2176 ha_innopart::index_prev_in_part(
2177 	uint	part,
2178 	uchar*	record)
2179 {
2180 	int	error;
2181 
2182 	set_partition(part);
2183 	error = ha_innobase::index_prev(record);
2184 	update_partition(part);
2185 
2186 	ut_ad(m_ordered_scan_ongoing
2187 	      || m_ordered_rec_buffer == NULL
2188 	      || m_prebuilt->used_in_HANDLER
2189 	      || m_part_spec.start_part >= m_part_spec.end_part);
2190 
2191 	return(error);
2192 }
2193 
2194 /** Start index scan and return first record from a partition.
2195 This routine starts an index scan using a start key. The calling
2196 function will check the end key on its own.
2197 @param[in]	part		Partition to read from.
2198 @param[out]	record		First matching record in index in the partition.
2199 @param[in]	key		Key to match.
2200 @param[in]	keypart_map	Which part of the key to use.
2201 @param[in]	find_flag	Key condition/direction to use.
2202 @return	error number or 0. */
2203 int
2204 ha_innopart::index_read_map_in_part(
2205 	uint			part,
2206 	uchar*			record,
2207 	const uchar*		key,
2208 	key_part_map		keypart_map,
2209 	enum ha_rkey_function	find_flag)
2210 {
2211 	int	error;
2212 
2213 	set_partition(part);
2214 	error = ha_innobase::index_read_map(
2215 			record,
2216 			key,
2217 			keypart_map,
2218 			find_flag);
2219 	update_partition(part);
2220 	return(error);
2221 }
2222 
2223 /** Start index scan and return first record from a partition.
2224 This routine starts an index scan using a start key. The calling
2225 function will check the end key on its own.
2226 @param[in]	part		Partition to read from.
2227 @param[out]	record		First matching record in index in the partition.
2228 @param[in]	index		Index to read from.
2229 @param[in]	key		Key to match.
2230 @param[in]	keypart_map	Which part of the key to use.
2231 @param[in]	find_flag	Key condition/direction to use.
2232 @return	error number or 0. */
2233 int
2234 ha_innopart::index_read_idx_map_in_part(
2235 	uint			part,
2236 	uchar*			record,
2237 	uint			index,
2238 	const uchar*		key,
2239 	key_part_map		keypart_map,
2240 	enum ha_rkey_function	find_flag)
2241 {
2242 	int	error;
2243 
2244 	set_partition(part);
2245 	error = ha_innobase::index_read_idx_map(
2246 			record,
2247 			index,
2248 			key,
2249 			keypart_map,
2250 			find_flag);
2251 	update_partition(part);
2252 	return(error);
2253 }
2254 
2255 /** Return last matching record in index from a partition.
2256 @param[in]	part		Partition to read from.
2257 @param[out]	record		Last matching record in index in the partition.
2258 @param[in]	key		Key to match.
2259 @param[in]	keypart_map	Which part of the key to use.
2260 @return	error number or 0. */
2261 int
2262 ha_innopart::index_read_last_map_in_part(
2263 	uint		part,
2264 	uchar*		record,
2265 	const uchar*	key,
2266 	key_part_map	keypart_map)
2267 {
2268 	int	error;
2269 	set_partition(part);
2270 	error = ha_innobase::index_read_last_map(record, key, keypart_map);
2271 	update_partition(part);
2272 	return(error);
2273 }
2274 
2275 /** Start index scan and return first record from a partition.
2276 This routine starts an index scan using a start and end key.
2277 @param[in]	part		Partition to read from.
2278 @param[in,out]	record		First matching record in index in the partition,
2279 if NULL use table->record[0] as return buffer.
2280 @param[in]	start_key	Start key to match.
2281 @param[in]	end_key		End key to match.
2282 @param[in]	eq_range	Is equal range, start_key == end_key.
2283 @param[in]	sorted		Return rows in sorted order.
2284 @return	error number or 0. */
2285 int
2286 ha_innopart::read_range_first_in_part(
2287 	uint			part,
2288 	uchar*			record,
2289 	const key_range*	start_key,
2290 	const key_range*	end_key,
2291 	bool			eq_range,
2292 	bool			sorted)
2293 {
2294 	int	error;
2295 	uchar*	read_record = record;
2296 	set_partition(part);
2297 	if (read_record == NULL) {
2298 		read_record = table->record[0];
2299 	}
2300 	if (m_start_key.key != NULL) {
2301 		error = ha_innobase::index_read(
2302 				read_record,
2303 				m_start_key.key,
2304 				m_start_key.length,
2305 				m_start_key.flag);
2306 	} else {
2307 		error = ha_innobase::index_first(read_record);
2308 	}
2309 	if (error == HA_ERR_KEY_NOT_FOUND) {
2310 		error = HA_ERR_END_OF_FILE;
2311 	} else if (error == 0 && !in_range_check_pushed_down) {
2312 		/* compare_key uses table->record[0], so we
2313 		need to copy the data if not already there. */
2314 
2315 		if (record != NULL) {
2316 			copy_cached_row(table->record[0], read_record);
2317 		}
2318 		if (compare_key(end_range) > 0) {
2319 			/* Must use ha_innobase::unlock_row() here, since
2320 			set/update_partition() could overwrite state if
2321 			ha_innopart::unlock_row() were used. */
2322 			ha_innobase::unlock_row();
2323 			error = HA_ERR_END_OF_FILE;
2324 		}
2325 	}
2326 	update_partition(part);
2327 	return(error);
2328 }
2329 
2330 /** Return next record in index range scan from a partition.
2331 @param[in]	part	Partition to read from.
2332 @param[in,out]	record	First matching record in index in the partition,
2333 if NULL use table->record[0] as return buffer.
2334 @return	error number or 0. */
2335 int
2336 ha_innopart::read_range_next_in_part(
2337 	uint	part,
2338 	uchar*	record)
2339 {
2340 	int	error;
2341 	uchar*	read_record = record;
2342 
2343 	set_partition(part);
2344 	if (read_record == NULL) {
2345 		read_record = table->record[0];
2346 	}
2347 
2348 	/* TODO: Implement ha_innobase::read_range*?
2349 	So it will return HA_ERR_END_OF_FILE or
2350 	HA_ERR_KEY_NOT_FOUND when passing end_range. */
2351 
2352 	error = ha_innobase::index_next(read_record);
2353 	if (error == 0 && !in_range_check_pushed_down) {
2354 		/* compare_key uses table->record[0], so we
2355 		need to copy the data if not already there. */
2356 
2357 		if (record != NULL) {
2358 			copy_cached_row(table->record[0], read_record);
2359 		}
2360 		if (compare_key(end_range) > 0) {
2361 			/* Must use ha_innobase::unlock_row() here, since
2362 			set/update_partition() could overwrite state if
2363 			ha_innopart::unlock_row() were used. */
2364 			ha_innobase::unlock_row();
2365 			error = HA_ERR_END_OF_FILE;
2366 		}
2367 	}
2368 	update_partition(part);
2369 
2370 	return(error);
2371 }
2372 
2373 /** Initialize a table scan in a specific partition.
2374 @param[in]	part_id	Partition to initialize.
2375 @param[in]	scan	True if table/index scan, false otherwise (for rnd_pos).
2376 @return	0 or error number. */
2377 int
2378 ha_innopart::rnd_init_in_part(
2379 	uint	part_id,
2380 	bool	scan)
2381 {
2382 	int	err;
2383 
2384 	if (m_prebuilt->clust_index_was_generated) {
2385 		err = change_active_index(part_id, MAX_KEY);
2386 	} else {
2387 		err = change_active_index(part_id, m_primary_key);
2388 	}
2389 
2390 	m_start_of_scan = 1;
2391 
2392 	/* Don't use semi-consistent read in random row reads (by position).
2393 	This means we must disable semi_consistent_read if scan is false. */
2394 
2395 	if (!scan) {
2396 		try_semi_consistent_read(false);
2397 	}
2398 
2399 	return(err);
2400 }
2401 
2402 /** Ends a table scan.
2403 @param[in]	part_id	Partition to end table scan in.
2404 @param[in]	scan	True for scan else random access.
2405 @return	0 or error number. */
2406 int
2407 ha_innopart::rnd_end_in_part(
2408 	uint	part_id,
2409 	bool	scan)
2410 {
2411 	return(index_end());
2412 }
2413 
2414 /** Read next row in partition.
2415 Reads the next row in a table scan (also used to read the FIRST row
2416 in a table scan).
2417 @param[in]	part_id	Partition to read from.
2418 @param[out]	buf	Returns the row in this buffer, in MySQL format.
2419 @return	0, HA_ERR_END_OF_FILE or error number. */
2420 int
2421 ha_innopart::rnd_next_in_part(
2422 	uint	part_id,
2423 	uchar*	buf)
2424 {
2425 	int	error;
2426 
2427 	DBUG_ENTER("ha_innopart::rnd_next_in_part");
2428 
2429 	set_partition(part_id);
2430 	if (m_start_of_scan) {
2431 		error = ha_innobase::index_first(buf);
2432 
2433 		if (error == HA_ERR_KEY_NOT_FOUND) {
2434 			error = HA_ERR_END_OF_FILE;
2435 		}
2436 		m_start_of_scan = 0;
2437 	} else {
2438 		ha_statistic_increment(&SSV::ha_read_rnd_next_count);
2439 		error = ha_innobase::general_fetch(buf, ROW_SEL_NEXT, 0);
2440 	}
2441 
2442 	update_partition(part_id);
2443 	DBUG_RETURN(error);
2444 }
2445 
2446 /** Get a row from a position.
2447 Fetches a row from the table based on a row reference.
2448 @param[out]	buf	Returns the row in this buffer, in MySQL format.
2449 @param[in]	pos	Position, given as primary key value or DB_ROW_ID
2450 (if no primary key) of the row in MySQL format.  The length of data in pos has
2451 to be ref_length.
2452 @return	0, HA_ERR_KEY_NOT_FOUND or error code. */
2453 int
2454 ha_innopart::rnd_pos(
2455 	uchar*	buf,
2456 	uchar*	pos)
2457 {
2458 	int	error;
2459 	uint	part_id;
2460 	DBUG_ENTER("ha_innopart::rnd_pos");
2461 	ut_ad(PARTITION_BYTES_IN_POS == 2);
2462 	DBUG_DUMP("pos", pos, ref_length);
2463 
2464 	ha_statistic_increment(&SSV::ha_read_rnd_count);
2465 
2466 	ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));
2467 
2468 	/* Restore used partition. */
2469 	part_id = uint2korr(pos);
2470 
2471 	set_partition(part_id);
2472 
2473 	/* Note that we assume the length of the row reference is fixed
2474 	for the table, and it is == ref_length. */
2475 
2476 	error = ha_innobase::index_read(buf, pos + PARTITION_BYTES_IN_POS,
2477 				ref_length - PARTITION_BYTES_IN_POS,
2478 				HA_READ_KEY_EXACT);
2479 	DBUG_PRINT("info", ("part %u index_read returned %d", part_id, error));
2480 	DBUG_DUMP("buf", buf, table_share->reclength);
2481 
2482 	update_partition(part_id);
2483 
2484 	DBUG_RETURN(error);
2485 }
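
/* Editorial note (hedged): the row reference ('ref') handled here is assumed
to be laid out as

	ref[0..1]              partition id, low byte first (uint2korr/int2store)
	ref[2..ref_length-1]   primary key value, or DB_ROW_ID if the table has
	                       no primary key

which is why rnd_pos() reads the partition id from the first
PARTITION_BYTES_IN_POS bytes and passes the remainder to index_read(). */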
2486 
2487 /** Return position for cursor in last used partition.
2488 Stores a reference to the current row to 'ref' field of the handle. Note
2489 that in the case where we have generated the clustered index for the
2490 table, the function parameter is illogical: we MUST ASSUME that 'record'
2491 is the current 'position' of the handle, because if row ref is actually
2492 the row id internally generated in InnoDB, then 'record' does not contain
2493 it. We just guess that the row id must be for the record where the handle
2494 was positioned the last time.
2495 @param[out]	ref_arg	Pointer to buffer where to write the position.
2496 @param[in]	record	Record to position for. */
2497 void
2498 ha_innopart::position_in_last_part(
2499 	uchar*		ref_arg,
2500 	const uchar*	record)
2501 {
2502 	if (m_prebuilt->clust_index_was_generated) {
2503 		/* No primary key was defined for the table and we
2504 		generated the clustered index from row id: the
2505 		row reference will be the row id, not any key value
2506 		that MySQL knows of. */
2507 
2508 		memcpy(ref_arg, m_prebuilt->row_id, DATA_ROW_ID_LEN);
2509 	} else {
2510 
2511 		/* Copy primary key as the row reference */
2512 		KEY*	key_info = table->key_info + m_primary_key;
2513 		key_copy(ref_arg, (uchar*)record, key_info,
2514 			 key_info->key_length);
2515 	}
2516 }
2517 
2518 /** Fill in data_dir_path and tablespace name from internal data
2519 dictionary.
2520 @param	part_elem	Partition element to fill.
2521 @param	ib_table	InnoDB table to copy from. */
2522 void
2523 ha_innopart::update_part_elem(
2524 	partition_element*	part_elem,
2525 	dict_table_t*		ib_table)
2526 {
2527 	dict_get_and_save_data_dir_path(ib_table, false);
2528 	if (ib_table->data_dir_path != NULL) {
2529 		if (part_elem->data_file_name == NULL
2530 		    || strcmp(ib_table->data_dir_path,
2531 			part_elem->data_file_name) != 0) {
2532 
2533 			/* Play safe and allocate memory from TABLE and copy
2534 			instead of exposing the internal data dictionary. */
2535 			part_elem->data_file_name =
2536 				strdup_root(&table->mem_root,
2537 					ib_table->data_dir_path);
2538 		}
2539 	} else {
2540 		part_elem->data_file_name = NULL;
2541 	}
2542 
2543 	part_elem->index_file_name = NULL;
2544 	dict_get_and_save_space_name(ib_table, false);
2545 	if (ib_table->tablespace != NULL) {
2546 		ut_ad(part_elem->tablespace_name == NULL
2547 		      || 0 == strcmp(part_elem->tablespace_name,
2548 				ib_table->tablespace));
2549 		if (part_elem->tablespace_name == NULL
2550 		    || strcmp(ib_table->tablespace,
2551 			part_elem->tablespace_name) != 0) {
2552 
2553 			/* Play safe and allocate memory from TABLE and copy
2554 			instead of exposing the internal data dictionary. */
2555 			part_elem->tablespace_name =
2556 				strdup_root(&table->mem_root,
2557 					ib_table->tablespace);
2558 		}
2559 	}
2560 	else {
2561 		ut_ad(part_elem->tablespace_name == NULL
2562 		      || 0 == strcmp(part_elem->tablespace_name,
2563 				     "innodb_file_per_table"));
2564 		if (part_elem->tablespace_name != NULL
2565 		    && 0 != strcmp(part_elem->tablespace_name,
2566 				   "innodb_file_per_table")) {
2567 
2568 			/* Update part_elem tablespace to NULL, the same as
2569 			in the InnoDB data dictionary (ib_table). */
2570 			part_elem->tablespace_name = NULL;
2571 		}
2572 	}
2573 }
2574 
2575 /** Update create_info.
2576 Used in SHOW CREATE TABLE et al.
2577 @param[in,out]	create_info	Create info to update. */
2578 void
2579 ha_innopart::update_create_info(
2580 	HA_CREATE_INFO*	create_info)
2581 {
2582 	uint		num_subparts	= m_part_info->num_subparts;
2583 	uint		num_parts;
2584 	uint		part;
2585 	dict_table_t*	table;
2586 	List_iterator<partition_element>
2587 				part_it(m_part_info->partitions);
2588 	partition_element*	part_elem;
2589 	partition_element*	sub_elem;
2590 	DBUG_ENTER("ha_innopart::update_create_info");
2591 	if ((create_info->used_fields & HA_CREATE_USED_AUTO) == 0) {
2592 		info(HA_STATUS_AUTO);
2593 		create_info->auto_increment_value = stats.auto_increment_value;
2594 	}
2595 
2596 	num_parts = (num_subparts != 0) ? m_tot_parts / num_subparts : m_tot_parts;
2597 
2598 	/* DATA/INDEX DIRECTORY are never applied to the whole partitioned
2599 	table, only to its parts. */
2600 
2601 	create_info->data_file_name = NULL;
2602 	create_info->index_file_name = NULL;
2603 
2604 	/* Since update_create_info() can be called from
2605 	mysql_prepare_alter_table() when not all partitions are set up,
2606 	we look for that condition first.
2607 	If not all partitions are available, simply return, since the
2608 	statement does not need any updated partitioning info. */
2609 
2610 	if (!m_part_info->temp_partitions.is_empty()) {
2611 		DBUG_VOID_RETURN;
2612 	}
2613 	part = 0;
2614 	while ((part_elem = part_it++)) {
2615 		if (part >= num_parts) {
2616 			DBUG_VOID_RETURN;
2617 		}
2618 		if (m_part_info->is_sub_partitioned()) {
2619 			List_iterator<partition_element>
2620 				subpart_it(part_elem->subpartitions);
2621 			uint	subpart = 0;
2622 			while ((sub_elem = subpart_it++)) {
2623 				if (subpart >= num_subparts) {
2624 					DBUG_VOID_RETURN;
2625 				}
2626 				subpart++;
2627 			}
2628 			if (subpart != num_subparts) {
2629 				DBUG_VOID_RETURN;
2630 			}
2631 		}
2632 		part++;
2633 	}
2634 	if (part != num_parts) {
2635 		DBUG_VOID_RETURN;
2636 	}
2637 
2638 	/* part_elem->data_file_name and tablespace_name should be correct from
2639 	the .frm, but may have been changed, so update from SYS_DATAFILES.
2640 	index_file_name is ignored, so remove it. */
2641 
2642 	part = 0;
2643 	part_it.rewind();
2644 	while ((part_elem = part_it++)) {
2645 		if (m_part_info->is_sub_partitioned()) {
2646 			List_iterator<partition_element>
2647 				subpart_it(part_elem->subpartitions);
2648 			while ((sub_elem = subpart_it++)) {
2649 				table = m_part_share->get_table_part(part++);
2650 				update_part_elem(sub_elem, table);
2651 			}
2652 		} else {
2653 			table = m_part_share->get_table_part(part++);
2654 			update_part_elem(part_elem, table);
2655 		}
2656 	}
2657 	DBUG_VOID_RETURN;
2658 }
2659 
2660 /** Set create_info->data_file_name.
2661 @param[in]	part_elem	Partition to copy from.
2662 @param[in,out]	info		Create info to set. */
2663 static
2664 void
2665 set_create_info_dir(
2666 	partition_element*	part_elem,
2667 	HA_CREATE_INFO*		info)
2668 {
2669 	if (part_elem->data_file_name != NULL
2670 	    && part_elem->data_file_name[0] != '\0') {
2671 		info->data_file_name = part_elem->data_file_name;
2672 		/* Also implies non-default tablespace. */
2673 		info->tablespace = NULL;
2674 	}
2675 	if (part_elem->index_file_name != NULL
2676 	    && part_elem->index_file_name[0] != '\0') {
2677 		info->index_file_name = part_elem->index_file_name;
2678 	}
2679 	if (part_elem->tablespace_name != NULL
2680 	    && part_elem->tablespace_name[0] != '\0') {
2681 		info->tablespace = part_elem->tablespace_name;
2682 	}
2683 }
2684 
2685 /** Set flags and append '/' to remote path if necessary. */
2686 void
2687 create_table_info_t::set_remote_path_flags()
2688 {
2689 	if (m_remote_path[0] != '\0') {
2690 		ut_ad(DICT_TF_HAS_DATA_DIR(m_flags) != 0);
2691 
2692 		/* os_file_make_remote_pathname will truncate
2693 		everything after the last '/', so append '/'
2694 		if it is not the last character. */
2695 
2696 		size_t len = strlen(m_remote_path);
2697 		if (m_remote_path[len - 1] != OS_PATH_SEPARATOR) {
2698 			m_remote_path[len] = OS_PATH_SEPARATOR;
2699 			m_remote_path[len + 1] = '\0';
2700 		}
2701 	} else {
2702 		ut_ad(DICT_TF_HAS_DATA_DIR(m_flags) == 0);
2703 	}
2704 }
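
/* Editorial example (hedged): with DATA DIRECTORY = '/ssd/data', m_remote_path
arrives here as "/ssd/data" and leaves as "/ssd/data/", so that
os_file_make_remote_pathname() keeps the whole directory instead of truncating
at the last path separator. */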
2705 
2706 /** Creates a new table in an InnoDB database.
2707 @param[in]	name		Table name (in filesystem charset).
2708 @param[in]	form		MySQL Table containing information of
2709 partitions, columns and indexes etc.
2710 @param[in]	create_info	Additional create information, like
2711 create statement string.
2712 @return	0 or error number. */
2713 int
2714 ha_innopart::create(
2715 	const char*	name,
2716 	TABLE*		form,
2717 	HA_CREATE_INFO*	create_info)
2718 {
2719 	int		error;
2720 	/** {database}/{tablename} */
2721 	char		table_name[FN_REFLEN];
2722 	/** absolute path of temp frm */
2723 	char		temp_path[FN_REFLEN];
2724 	/** absolute path of table */
2725 	char		remote_path[FN_REFLEN];
2726 	char		partition_name[FN_REFLEN];
2727 	char		tablespace_name[NAME_LEN + 1];
2728 	char*		table_name_end;
2729 	size_t		table_name_len;
2730 	size_t		db_name_length;
2731 	ulint		stat_table_name_length;
2732 	char*		partition_name_start;
2733 	char		table_data_file_name[FN_REFLEN];
2734 	char		table_level_tablespace_name[NAME_LEN + 1];
2735 	const char*	index_file_name;
2736 	size_t		len;
2737 
2738 	create_table_info_t	info(ha_thd(),
2739 				     form,
2740 				     create_info,
2741 				     table_name,
2742 				     temp_path,
2743 				     remote_path,
2744 				     tablespace_name);
2745 
2746 	DBUG_ENTER("ha_innopart::create");
2747 
2748 	if (is_shared_tablespace(create_info->tablespace)) {
2749 		push_deprecated_warn_no_replacement(
2750 			ha_thd(), PARTITION_IN_SHARED_TABLESPACE_WARNING);
2751 	}
2752 
2753 	ut_ad(create_info != NULL);
2754 	ut_ad(m_part_info == form->part_info);
2755 	ut_ad(table_share != NULL);
2756 
2757 	/* Not allowed to create temporary partitioned tables. */
2758 	if (create_info != NULL
2759 	    && (create_info->options & HA_LEX_CREATE_TMP_TABLE) != 0) {
2760 		my_error(ER_PARTITION_NO_TEMPORARY, MYF(0));
2761 		ut_ad(0); // Can we support partitioned temporary tables?
2762 		DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
2763 	}
2764 
2765 	error = info.initialize();
2766 	if (error != 0) {
2767 		DBUG_RETURN(error);
2768 	}
2769 
2770 	/* Setup and check table level options. */
2771 	error = info.prepare_create_table(name);
2772 	if (error != 0) {
2773 		DBUG_RETURN(error);
2774 	}
2775 	ut_ad(temp_path[0] == '\0');
2776 	db_name_length = strchr(table_name,'/') - table_name;
2777 	strcpy(partition_name, table_name);
2778 	partition_name_start = partition_name + strlen(partition_name);
2779 	table_name_len = strlen(table_name);
2780 	table_name_end = table_name + table_name_len;
2781 	if (create_info->data_file_name != NULL) {
2782 		/* Strip the tablename from the path. */
2783 		strncpy(table_data_file_name, create_info->data_file_name,
2784 			FN_REFLEN-1);
2785 		table_data_file_name[FN_REFLEN - 1] = '\0';
2786 		char* ptr = strrchr(table_data_file_name, OS_PATH_SEPARATOR);
2787 		ut_ad(ptr != NULL);
2788 		if (ptr != NULL) {
2789 			ptr++;
2790 			*ptr = '\0';
2791 			create_info->data_file_name = table_data_file_name;
2792 		}
2793 	} else {
2794 		table_data_file_name[0] = '\0';
2795 	}
2796 	index_file_name = create_info->index_file_name;
2797 	if (create_info->tablespace != NULL) {
2798 		strcpy(table_level_tablespace_name, create_info->tablespace);
2799 	} else {
2800 		table_level_tablespace_name[0] = '\0';
2801 	}
2802 
2803 	info.allocate_trx();
2804 
2805 	/* Latch the InnoDB data dictionary exclusively so that no deadlocks
2806 	or lock waits can happen in it during a table create operation.
2807 	Drop table etc. do this latching in row0mysql.cc. */
2808 
2809 	row_mysql_lock_data_dictionary(info.trx());
2810 
2811 	/* A mismatch can occur in the length of the column "table_name" in
2812 	mysql.innodb_table_stats and mysql.innodb_index_stats after the
2813 	fix to increase the length of the table_name column to accommodate
2814 	partition names, so we first need to determine the length of the
2815 	"table_name" column and accordingly decide the maximum length of
2816 	the partition name. */
2817 
2818 	dict_table_t *table = dict_table_get_low(TABLE_STATS_NAME);
2819 	if (table != NULL) {
2820 		ulint col_no = dict_table_has_column(table, "table_name", 0);
2821 		ut_ad(col_no != table->n_def);
2822 		stat_table_name_length = table->cols[col_no].len;
2823 		if (stat_table_name_length > NAME_LEN) {
2824 			/* The maximum allowed length is 597 bytes,
2825 			but the file name length cannot exceed
2826 			FN_LEN. */
2827 			stat_table_name_length = FN_LEN;
2828 		} else {
2829 			stat_table_name_length = NAME_LEN;
2830 		}
2831 
2832 	} else {
2833 		/* Set the old length of 192 bytes in case of failure. */
2834 		stat_table_name_length = NAME_LEN;
2835 		ib::warn() << TABLE_STATS_NAME << " doesn't exist.";
2836 	}
2837 
2838 	/* TODO: use the new DD tables instead to decrease duplicate info. */
2839 	List_iterator_fast <partition_element>
2840 		part_it(form->part_info->partitions);
2841 	partition_element* part_elem;
2842 	while ((part_elem = part_it++)) {
2843 		/* Append the partition name to the table name. */
2844 		len = Ha_innopart_share::append_sep_and_name(
2845 				partition_name_start,
2846 				part_elem->partition_name,
2847 				part_sep,
2848 				FN_REFLEN - table_name_len);
2849 		/* Report error if the partition name with path separator
2850 		exceeds maximum path length. */
2851 		if ((table_name_len + len + sizeof "/") >= FN_REFLEN) {
2852 			error = HA_ERR_INTERNAL_ERROR;
2853 			my_error(ER_IDENT_CAUSES_TOO_LONG_PATH, MYF(0), FN_REFLEN,
2854 				partition_name);
2855 			goto cleanup;
2856 		}
2857 
2858 		/* Report error if table name with partition name exceeds
2859 		maximum file name length */
2860 		if ((len + table_name_len - db_name_length - 1)
2861 		     > stat_table_name_length) {
2862 			error = HA_ERR_INTERNAL_ERROR;
2863 			my_error(ER_PATH_LENGTH, MYF(0),
2864 				 partition_name + db_name_length + 1 );
2865 			goto cleanup;
2866 		}
2867 
2868 		/* Override table level DATA/INDEX DIRECTORY. */
2869 		set_create_info_dir(part_elem, create_info);
2870 
2871 		if (!form->part_info->is_sub_partitioned()) {
2872 			if (is_shared_tablespace(part_elem->tablespace_name)) {
2873 				push_deprecated_warn_no_replacement(
2874 					ha_thd(), PARTITION_IN_SHARED_TABLESPACE_WARNING);
2875 			}
2876 
2877 			error = info.prepare_create_table(partition_name);
2878 			if (error != 0) {
2879 				goto cleanup;
2880 			}
2881 			info.set_remote_path_flags();
2882 			error = info.create_table();
2883 			if (error != 0) {
2884 				goto cleanup;
2885 			}
2886 		} else {
2887 			size_t	part_name_len = strlen(partition_name_start)
2888 						+ table_name_len;
2889 			char*	part_name_end = partition_name + part_name_len;
2890 			List_iterator_fast <partition_element>
2891 				sub_it(part_elem->subpartitions);
2892 			partition_element* sub_elem;
2893 
2894 			while ((sub_elem = sub_it++)) {
2895 				ut_ad(sub_elem->partition_name != NULL);
2896 
2897 				if (is_shared_tablespace(sub_elem->tablespace_name)) {
2898 					push_deprecated_warn_no_replacement(
2899 						ha_thd(), PARTITION_IN_SHARED_TABLESPACE_WARNING);
2900 				}
2901 
2902 				/* 'table' will be
2903 				<name>#P#<part_name>#SP#<subpart_name>.
2904 				Append the sub-partition name to
2905 				the partition name. */
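
				/* Editorial example: for table test.t1 with
				partition p0 and subpartition sp0 this yields
				"test/t1#P#p0#SP#sp0" (the separators are
				folded to lower case on Windows, see
				part_sep/sub_sep). */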
2906 
2907 				len = Ha_innopart_share::append_sep_and_name(
2908 					part_name_end,
2909 					sub_elem->partition_name,
2910 					sub_sep,
2911 					FN_REFLEN - part_name_len);
2912 				/* Report error if the partition name with path separator
2913 				exceeds maximum path length. */
2914 				if ((len + part_name_len + sizeof "/") >= FN_REFLEN) {
2915 					error = HA_ERR_INTERNAL_ERROR;
2916 					my_error(ER_IDENT_CAUSES_TOO_LONG_PATH, MYF(0),
2917 						 FN_REFLEN,
2918 						 partition_name);
2919 					goto cleanup;
2920 				}
2921 
2922 				/* Report error if table name with partition
2923 				name exceeds maximum file name length */
2924 				if ((len + part_name_len - db_name_length - 1)
2925 				     > stat_table_name_length) {
2926 					error = HA_ERR_INTERNAL_ERROR;
2927 					my_error(ER_PATH_LENGTH, MYF(0),
2928 					partition_name + db_name_length + 1);
2929 					goto cleanup;
2930 				}
2931 
2932 				/* Override part level DATA/INDEX DIRECTORY. */
2933 				set_create_info_dir(sub_elem, create_info);
2934 
2935 				Ha_innopart_share::partition_name_casedn_str(
2936 					part_name_end + 4);
2937 				error = info.prepare_create_table(partition_name);
2938 				if (error != 0) {
2939 					goto cleanup;
2940 				}
2941 				info.set_remote_path_flags();
2942 				error = info.create_table();
2943 				if (error != 0) {
2944 					goto cleanup;
2945 				}
2946 
2947 				/* Reset partition level
2948 				DATA/INDEX DIRECTORY. */
2949 
2950 				create_info->data_file_name =
2951 					table_data_file_name;
2952 				create_info->index_file_name =
2953 					index_file_name;
2954 				create_info->tablespace =
2955 					table_level_tablespace_name;
2956 				set_create_info_dir(part_elem, create_info);
2957 			}
2958 		}
2959 		/* Reset table level DATA/INDEX DIRECTORY. */
2960 		create_info->data_file_name = table_data_file_name;
2961 		create_info->index_file_name = index_file_name;
2962 		create_info->tablespace = table_level_tablespace_name;
2963 	}
2964 
2965 	innobase_commit_low(info.trx());
2966 
2967 	row_mysql_unlock_data_dictionary(info.trx());
2968 
2969 	/* Flush the log to reduce probability that the .frm files and
2970 	the InnoDB data dictionary get out-of-sync if the user runs
2971 	with innodb_flush_log_at_trx_commit = 0. */
2972 
2973 	log_buffer_flush_to_disk();
2974 
2975 	part_it.rewind();
2976 	/* No need to use these now, only table_name will be used. */
2977 	create_info->data_file_name = NULL;
2978 	create_info->index_file_name = NULL;
2979 	while ((part_elem = part_it++)) {
2980 		len = Ha_innopart_share::append_sep_and_name(
2981 				table_name_end,
2982 				part_elem->partition_name,
2983 				part_sep,
2984 				FN_REFLEN - table_name_len);
2985 
2986 		if (!form->part_info->is_sub_partitioned()) {
2987 			error = info.create_table_update_dict();
2988 			if (error != 0) {
2989 				ut_ad(0);
2990 				goto end;
2991 			}
2992 		} else {
2993 			size_t	part_name_len = strlen(table_name_end);
2994 			char*	part_name_end = table_name_end + part_name_len;
2995 			List_iterator_fast <partition_element>
2996 				sub_it(part_elem->subpartitions);
2997 			partition_element* sub_elem;
2998 			while ((sub_elem = sub_it++)) {
2999 				len = Ha_innopart_share::append_sep_and_name(
3000 						part_name_end,
3001 						sub_elem->partition_name,
3002 						sub_sep,
3003 						FN_REFLEN - table_name_len
3004 						- part_name_len);
3005 
3006 				error = info.create_table_update_dict();
3007 				if (error != 0) {
3008 					ut_ad(0);
3009 					goto end;
3010 				}
3011 			}
3012 		}
3013 	}
3014 
3015 end:
3016 	/* Tell the InnoDB server that there might be work for
3017 	utility threads: */
3018 
3019 	srv_active_wake_master_thread();
3020 
3021 	trx_free_for_mysql(info.trx());
3022 
3023 	DBUG_RETURN(error);
3024 
3025 cleanup:
3026 	trx_rollback_for_mysql(info.trx());
3027 
3028 	row_mysql_unlock_data_dictionary(info.trx());
3029 
3030 	ulint	dummy;
3031 	char	norm_name[FN_REFLEN];
3032 
3033 	normalize_table_name(norm_name, name);
3034 
3035 	uint	lent = (uint) strlen(norm_name);
3036 	ut_a(lent < FN_REFLEN);
3037 	norm_name[lent] = '#';
3038 	norm_name[lent + 1] = 0;
3039 
3040 	row_drop_database_for_mysql(norm_name, info.trx(), &dummy);
3041 
3042 	trx_free_for_mysql(info.trx());
3043 	DBUG_RETURN(error);
3044 }
3045 
3046 /** Discards or imports an InnoDB tablespace.
3047 @param[in]	discard	True if discard, else import.
3048 @return	0 or error number. */
3049 int
3050 ha_innopart::discard_or_import_tablespace(
3051 	my_bool	discard)
3052 {
3053 	int	error = 0;
3054 	uint	i;
3055 	DBUG_ENTER("ha_innopart::discard_or_import_tablespace");
3056 
3057 	for (i= m_part_info->get_first_used_partition();
3058 	     i < m_tot_parts;
3059 	     i= m_part_info->get_next_used_partition(i)) {
3060 
3061 		m_prebuilt->table = m_part_share->get_table_part(i);
3062 		error= ha_innobase::discard_or_import_tablespace(discard);
3063 		if (error != 0) {
3064 			break;
3065 		}
3066 	}
3067 	m_prebuilt->table = m_part_share->get_table_part(0);
3068 
3069 	/* IMPORT/DISCARD also means resetting auto_increment. Make sure
3070 	that auto_increment initialization is done after all partitions
3071 	are imported. */
3072 	if (table->found_next_number_field != NULL) {
3073 		lock_auto_increment();
3074 		m_part_share->next_auto_inc_val = 0;
3075 		m_part_share->auto_inc_initialized = false;
3076 		unlock_auto_increment();
3077 	}
3078 
3079 	DBUG_RETURN(error);
3080 }
3081 
3082 /** Read zip dict-related info through the base class.
3083 @param[in]	thd		Thread handle.
3084 @param[in]	part_name	Must always be NULL.
3085 */
3086 void ha_innopart::update_field_defs_with_zip_dict_info(THD* thd,
3087 	const char* part_name)
3088 {
3089 	DBUG_ENTER("ha_innopart::update_field_defs_with_zip_dict_info");
3090 	char partition_name[FN_REFLEN];
3091 	bool res = get_first_partition_name(
3092 		thd, this, table_share->normalized_path.str,
3093 		table_share->partition_info_str,
3094 		table_share->partition_info_str_len, partition_name);
3095 	if (res)
3096 	{
3097 		ut_ad(0);
3098 		DBUG_VOID_RETURN;
3099 	}
3100 
3101 	ha_innobase::update_field_defs_with_zip_dict_info(thd, partition_name);
3102 	DBUG_VOID_RETURN;
3103 }
3104 
3105 /** Compare key and rowid.
3106 Helper function for sorting records in the priority queue.
3107 a/b points to table->record[0] rows which must have the
3108 key fields set. The bytes before a and b store the rowid.
3109 This is used for comparing/sorting rows first according to
3110 KEY and if same KEY, by rowid (ref).
3111 @param[in]	key_info	Null terminated array of index information.
3112 @param[in]	a		Pointer to record+ref in first record.
3113 @param[in]	b		Pointer to record+ref in second record.
3114 @return	Return value is SIGN(first_rec - second_rec)
3115 @retval	0	Keys are equal.
3116 @retval	-1	second_rec is greater than first_rec.
3117 @retval	+1	first_rec is greater than second_rec. */
3118 int
3119 ha_innopart::key_and_rowid_cmp(
3120 	KEY**	key_info,
3121 	uchar	*a,
3122 	uchar	*b)
3123 {
3124 	int	cmp = key_rec_cmp(key_info, a, b);
3125 	if (cmp != 0) {
3126 		return(cmp);
3127 	}
3128 
3129 	/* We must compare by rowid, which is added before the record,
3130 	in the priority queue. */
3131 
3132 	return(memcmp(a - DATA_ROW_ID_LEN, b - DATA_ROW_ID_LEN,
3133 		DATA_ROW_ID_LEN));
3134 }
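
/* Editorial note: the 'a' and 'b' buffers compared above are assumed to be
laid out as

	[DATA_ROW_ID_LEN bytes of rowid/ref][record in table->record[0] format]
	                                     ^ 'a'/'b' point here

so stepping back DATA_ROW_ID_LEN bytes from 'a' and 'b' reaches the rowid that
was stored in front of each queued record. */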
3135 
3136 /** Extra hints from MySQL.
3137 @param[in]	operation	Operation hint.
3138 @return	0 or error number. */
3139 int
3140 ha_innopart::extra(
3141 	enum ha_extra_function	operation)
3142 {
3143 	if (operation == HA_EXTRA_SECONDARY_SORT_ROWID) {
3144 		/* index_init(sorted=true) must have been called! */
3145 		ut_ad(m_ordered);
3146 		ut_ad(m_ordered_rec_buffer != NULL);
3147 		/* No index_read call must have been done! */
3148 		ut_ad(m_queue->empty());
3149 
3150 		/* If the PK is not set as the secondary sort, do the
3151 		secondary sort by rowid/ref. */
3152 
3153 		ut_ad(m_curr_key_info[1] != NULL
3154 		      || m_prebuilt->clust_index_was_generated != 0
3155 		      || m_curr_key_info[0]
3156 			 == table->key_info + table->s->primary_key);
3157 
3158 		if (m_curr_key_info[1] == NULL
3159 		    && m_prebuilt->clust_index_was_generated) {
3160 			m_ref_usage = Partition_helper::REF_USED_FOR_SORT;
3161 			m_queue->m_fun = key_and_rowid_cmp;
3162 		}
3163 		return(0);
3164 	}
3165 	return(ha_innobase::extra(operation));
3166 }
3167 
3168 /** Delete all rows in a partition.
3169 @return	0 or error number. */
3170 int
3171 ha_innopart::truncate_partition_low()
3172 {
3173 	return(truncate());
3174 }
3175 
3176 /** Deletes all rows of a partitioned InnoDB table.
3177 @return	0 or error number. */
3178 int
3179 ha_innopart::truncate()
3180 {
3181 	dberr_t		err = DB_SUCCESS;
3182 	int		error;
3183 
3184 	DBUG_ENTER("ha_innopart::truncate");
3185 
3186 	if (high_level_read_only) {
3187 		DBUG_RETURN(HA_ERR_TABLE_READONLY);
3188 	}
3189 
3190 	/* TRUNCATE also means resetting auto_increment. Hence, reset
3191 	it so that it will be initialized again at the next use. */
3192 
3193 	if (table->found_next_number_field != NULL) {
3194 		lock_auto_increment();
3195 		m_part_share->next_auto_inc_val= 0;
3196 		m_part_share->auto_inc_initialized= false;
3197 		unlock_auto_increment();
3198 	}
3199 
3200 	/* Get the transaction associated with the current thd, or create one
3201 	if not yet created, and update m_prebuilt->trx. */
3202 
3203 	update_thd(ha_thd());
3204 
3205 	if (!trx_is_started(m_prebuilt->trx)) {
3206 		++m_prebuilt->trx->will_lock;
3207 	}
3208 	/* Truncate the table in InnoDB. */
3209 
3210 	for (uint i = m_part_info->get_first_used_partition();
3211 	     i < m_tot_parts;
3212 	     i = m_part_info->get_next_used_partition(i)) {
3213 
3214 		set_partition(i);
3215 		err = row_truncate_table_for_mysql(m_prebuilt->table,
3216 				m_prebuilt->trx);
3217 		update_partition(i);
3218 		if (err != DB_SUCCESS) {
3219 			break;
3220 		}
3221 	}
3222 
3223 	switch (err) {
3224 
3225 	case DB_TABLESPACE_DELETED:
3226 	case DB_TABLESPACE_NOT_FOUND:
3227 		ib_senderrf(
3228 			m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
3229 			(err == DB_TABLESPACE_DELETED ?
3230 			ER_TABLESPACE_DISCARDED : ER_TABLESPACE_MISSING),
3231 			table->s->table_name.str);
3232 		table->status = STATUS_NOT_FOUND;
3233 		error = HA_ERR_NO_SUCH_TABLE;
3234 		break;
3235 
3236 	default:
3237 		error = convert_error_code_to_mysql(
3238 			err, m_prebuilt->table->flags,
3239 			m_prebuilt->trx->mysql_thd);
3240 		table->status = STATUS_NOT_FOUND;
3241 		break;
3242 	}
3243 	DBUG_RETURN(error);
3244 }
3245 
3246 #ifdef WL6742
3247 
3248 /* Removing Wl6742 as part of Bug#23046302 */
3249 
3250 /** Total number of rows in all used partitions.
3251 Returns the exact number of records that this client can see using this
3252 handler object.
3253 @param[out]	num_rows	Number of rows.
3254 @return	0 or error number. */
3255 int
3256 ha_innopart::records(
3257 	ha_rows*	num_rows)
3258 {
3259 	ha_rows	n_rows;
3260 	int	err;
3261 	DBUG_ENTER("ha_innopart::records()");
3262 
3263 	*num_rows = 0;
3264 
3265 	/* The index scan is probably so expensive that the overhead
3266 	of the rest of the function is negligible for each partition.
3267 	So there is no current reason to optimize this further. */
3268 
3269 	for (uint i = m_part_info->get_first_used_partition();
3270 	     i < m_tot_parts;
3271 	     i = m_part_info->get_next_used_partition(i)) {
3272 
3273 		set_partition(i);
3274 		err = ha_innobase::records(&n_rows);
3275 		update_partition(i);
3276 		if (err != 0) {
3277 			*num_rows = HA_POS_ERROR;
3278 			DBUG_RETURN(err);
3279 		}
3280 		*num_rows += n_rows;
3281 	}
3282 	DBUG_RETURN(0);
3283 }
3284 #endif
3285 
3286 /** Estimates the number of index records in a range.
3287 @param[in]	keynr	Index number.
3288 @param[in]	min_key	Start key value (or NULL).
3289 @param[in]	max_key	End key value (or NULL).
3290 @return	estimated number of rows. */
3291 ha_rows
3292 ha_innopart::records_in_range(
3293 	uint		keynr,
3294 	key_range*	min_key,
3295 	key_range*	max_key)
3296 {
3297 	KEY*		key;
3298 	dict_index_t*	index;
3299 	dtuple_t*	range_start;
3300 	dtuple_t*	range_end;
3301 	int64_t		n_rows = 0;
3302 	page_cur_mode_t	mode1;
3303 	page_cur_mode_t	mode2;
3304 	mem_heap_t*	heap;
3305 	uint		part_id;
3306 
3307 	DBUG_ENTER("ha_innopart::records_in_range");
3308 	DBUG_PRINT("info", ("keynr %u min %p max %p", keynr, min_key, max_key));
3309 
3310 	ha_rows ret = innodb_records_in_range(ha_thd());
3311 	if (ret) {
3312 		DBUG_RETURN(ret);
3313 	}
3314 	if (table->force_index) {
3315 		const ha_rows force_rows = innodb_force_index_records_in_range(ha_thd());
3316 		if (force_rows) {
3317 			DBUG_RETURN(force_rows);
3318 		}
3319 	}
3320 
3321 	ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));
3322 
3323 	m_prebuilt->trx->op_info = (char*)"estimating records in index range";
3324 
3325 	/* In case MySQL calls this in the middle of a SELECT query, release
3326 	possible adaptive hash latch to avoid deadlocks of threads. */
3327 
3328 	trx_search_latch_release_if_reserved(m_prebuilt->trx);
3329 
3330 	active_index = keynr;
3331 
3332 	key = table->key_info + active_index;
3333 
3334 	part_id = m_part_info->get_first_used_partition();
3335 	if (part_id == MY_BIT_NONE) {
3336 		DBUG_RETURN(0);
3337 	}
3338 	/* This also sets m_prebuilt->index! */
3339 	set_partition(part_id);
3340 	index = m_prebuilt->index;
3341 
3342 	/* There is a possibility of not being able to find the requested
3343 	index due to an inconsistency between MySQL and InnoDB dictionary info.
3344 	The necessary message should have been printed in innopart_get_index(). */
3345 	if (index == NULL
3346 	    || dict_table_is_discarded(m_prebuilt->table)
3347 	    || !row_merge_is_index_usable(m_prebuilt->trx, index)) {
3348 
3349 		n_rows = HA_POS_ERROR;
3350 		goto func_exit;
3351 	}
3352 
3353 	heap = mem_heap_create(2 * (key->actual_key_parts * sizeof(dfield_t)
3354 				    + sizeof(dtuple_t)));
3355 
3356 	range_start = dtuple_create(heap, key->actual_key_parts);
3357 	dict_index_copy_types(range_start, index, key->actual_key_parts);
3358 
3359 	range_end = dtuple_create(heap, key->actual_key_parts);
3360 	dict_index_copy_types(range_end, index, key->actual_key_parts);
3361 
3362 	row_sel_convert_mysql_key_to_innobase(
3363 		range_start,
3364 		m_prebuilt->srch_key_val1,
3365 		m_prebuilt->srch_key_val_len,
3366 		index,
3367 		(byte*) (min_key ? min_key->key : (const uchar*) 0),
3368 		(ulint) (min_key ? min_key->length : 0),
3369 		m_prebuilt->trx);
3370 
3371 	ut_ad(min_key != NULL
3372 	      ? range_start->n_fields > 0
3373 	      : range_start->n_fields == 0);
3374 
3375 	row_sel_convert_mysql_key_to_innobase(
3376 		range_end,
3377 		m_prebuilt->srch_key_val2,
3378 		m_prebuilt->srch_key_val_len,
3379 		index,
3380 		(byte*) (max_key != NULL ? max_key->key : (const uchar*) 0),
3381 		(ulint) (max_key != NULL ? max_key->length : 0),
3382 		m_prebuilt->trx);
3383 
3384 	ut_ad(max_key != NULL
3385 	      ? range_end->n_fields > 0
3386 	      : range_end->n_fields == 0);
3387 
3388 	mode1 = convert_search_mode_to_innobase(min_key ? min_key->flag :
3389 						HA_READ_KEY_EXACT);
3390 	mode2 = convert_search_mode_to_innobase(max_key ? max_key->flag :
3391 						HA_READ_KEY_EXACT);
3392 
3393 	if (mode1 != PAGE_CUR_UNSUPP && mode2 != PAGE_CUR_UNSUPP) {
3394 
3395 		n_rows = btr_estimate_n_rows_in_range(index, range_start,
3396 						      mode1, range_end,
3397 						      mode2);
3398 		DBUG_PRINT("info", ("part_id %u rows %ld", part_id,
3399 					(long int) n_rows));
3400 		for (part_id = m_part_info->get_next_used_partition(part_id);
3401 		     part_id < m_tot_parts;
3402 		     part_id = m_part_info->get_next_used_partition(part_id)) {
3403 
3404 			index = m_part_share->get_index(part_id, keynr);
3405 			/* Individual partitions can be discarded,
3406 			so we need to check each partition. */
3407 			if (index == NULL
3408 			    || dict_table_is_discarded(index->table)
3409 			    || !row_merge_is_index_usable(m_prebuilt->trx,index))
3410 			{
3411 
3412 				n_rows = HA_POS_ERROR;
3413 				mem_heap_free(heap);
3414 				goto func_exit;
3415 			}
3416 			int64_t n = btr_estimate_n_rows_in_range(index,
3417 							       range_start,
3418 							       mode1,
3419 							       range_end,
3420 							       mode2);
3421 			n_rows += n;
3422 			DBUG_PRINT("info", ("part_id %u rows %ld (%ld)",
3423 						part_id,
3424 						(long int) n,
3425 						(long int) n_rows));
3426 		}
3427 	} else {
3428 
3429 		n_rows = HA_POS_ERROR;
3430 	}
3431 
3432 	mem_heap_free(heap);
3433 
3434 func_exit:
3435 
3436 	m_prebuilt->trx->op_info = (char*)"";
3437 
3438 	/* The MySQL optimizer seems to believe an estimate of 0 rows is
3439 	always accurate and may return the result 'Empty set' based on that.
3440 	The accuracy is not guaranteed, and even if it were, for a locking
3441 	read we should anyway perform the search to set the next-key lock.
3442 	Add 1 to the value to make sure MySQL does not make the assumption! */
3443 
3444 	if (n_rows == 0) {
3445 		n_rows = 1;
3446 	}
3447 
3448 	DBUG_RETURN((ha_rows) n_rows);
3449 }
3450 
3451 /** Gives an UPPER BOUND to the number of rows in a table.
3452 This is used in filesort.cc.
3453 @return	upper bound of rows. */
3454 ha_rows
3455 ha_innopart::estimate_rows_upper_bound()
3456 {
3457 	const dict_index_t*	index;
3458 	ulonglong		estimate = 0;
3459 	ulonglong		local_data_file_length;
3460 	ulint			stat_n_leaf_pages;
3461 
3462 	DBUG_ENTER("ha_innopart::estimate_rows_upper_bound");
3463 
3464 	/* We do not know if MySQL can call this function before calling
3465 	external_lock(). To be safe, update the thd of the current table
3466 	handle. */
3467 
3468 	update_thd(ha_thd());
3469 
3470 	m_prebuilt->trx->op_info = "calculating upper bound for table rows";
3471 
3472 	/* In case MySQL calls this in the middle of a SELECT query, release
3473 	possible adaptive hash latch to avoid deadlocks of threads. */
3474 
3475 	trx_search_latch_release_if_reserved(m_prebuilt->trx);
3476 
3477 	for (uint i = m_part_info->get_first_used_partition();
3478 	     i < m_tot_parts;
3479 	     i = m_part_info->get_next_used_partition(i)) {
3480 
3481 		m_prebuilt->table = m_part_share->get_table_part(i);
3482 		index = dict_table_get_first_index(m_prebuilt->table);
3483 
3484 		stat_n_leaf_pages = index->stat_n_leaf_pages;
3485 
3486 		ut_a(stat_n_leaf_pages > 0);
3487 
3488 		local_data_file_length =
3489 			((ulonglong) stat_n_leaf_pages) * UNIV_PAGE_SIZE;
3490 
3491 		/* Calculate a minimum length for a clustered index record
3492 		and from that an upper bound for the number of rows.
3493 		Since we only calculate new statistics in row0mysql.cc when a
3494 		table has grown by a threshold factor,
3495 		we must add a safety factor 2 in front of the formula below. */
3496 
3497 		estimate += 2 * local_data_file_length
3498 			/ dict_index_calc_min_rec_len(index);
3499 	}
3500 
3501 	m_prebuilt->trx->op_info = "";
3502 
3503 	DBUG_RETURN((ha_rows) estimate);
3504 }
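/* Worked example of the bound computed above, with made-up numbers
(added commentary): assuming UNIV_PAGE_SIZE = 16384, a partition whose
clustered index has stat_n_leaf_pages = 100 and a minimum record length
of 20 bytes (dict_index_calc_min_rec_len()) contributes

	2 * (100 * 16384) / 20 = 163840

rows to the estimate. The factor 2 and the minimum record length make
this a deliberate over-estimate, which is what filesort expects. */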
3505 
3506 /** Time estimate for full table scan.
3507 How many seeks it will take to read through the table. This is to be
3508 comparable to the number returned by records_in_range so that we can
3509 decide if we should scan the table or use keys.
3510 @return	estimated time measured in disk seeks. */
3511 double
3512 ha_innopart::scan_time()
3513 {
3514 	double	scan_time = 0.0;
3515 	DBUG_ENTER("ha_innopart::scan_time");
3516 
3517 	for (uint i = m_part_info->get_first_used_partition();
3518 	     i < m_tot_parts;
3519 	     i = m_part_info->get_next_used_partition(i)) {
3520 		m_prebuilt->table = m_part_share->get_table_part(i);
3521 		scan_time += ha_innobase::scan_time();
3522 	}
3523 	DBUG_RETURN(scan_time);
3524 }
3525 
3526 /** Updates the statistics for one partition (table).
3527 @param[in]	table		Table to update the statistics for.
3528 @param[in]	is_analyze	True if called from ::analyze().
3529 @return	error code. */
3530 static
3531 int
3532 update_table_stats(
3533 	dict_table_t*	table,
3534 	bool		is_analyze)
3535 {
3536 	dict_stats_upd_option_t	opt;
3537 	dberr_t			ret;
3538 
3539 	if (dict_stats_is_persistent_enabled(table)) {
3540 		if (is_analyze) {
3541 			opt = DICT_STATS_RECALC_PERSISTENT;
3542 		} else {
3543 			/* This is e.g. 'SHOW INDEXES';
3544 			fetch the persistent stats from disk. */
3545 			opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY;
3546 		}
3547 	} else {
3548 		opt = DICT_STATS_RECALC_TRANSIENT;
3549 	}
3550 
3551 	ut_ad(!mutex_own(&dict_sys->mutex));
3552 	ret = dict_stats_update(table, opt);
3553 
3554 	if (ret != DB_SUCCESS) {
3555 		return(HA_ERR_GENERIC);
3556 	}
3557 	return(0);
3558 }
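/* Call pattern sketch (added commentary): info_low() below invokes this
helper once per used partition whenever statistics must be refreshed, so
the persistent vs. transient decision is taken per partition:

	ANALYZE TABLE path:		update_table_stats(ib_table, true)
	SHOW INDEXES (metadata) path:	update_table_stats(ib_table, false)
*/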
3559 
3560 /** Updates and returns statistics.
3561 Returns statistics information of the table to the MySQL interpreter,
3562 in various fields of the handle object.
3563 @param[in]	flag		Flags for what to update and return.
3564 @param[in]	is_analyze	True if called from ::analyze().
3565 @return	HA_ERR_* error code or 0. */
3566 int
3567 ha_innopart::info_low(
3568 	uint	flag,
3569 	bool	is_analyze)
3570 {
3571 	dict_table_t*	ib_table;
3572 	ib_uint64_t	max_rows = 0;
3573 	uint		biggest_partition = 0;
3574 	int		error = 0;
3575 
3576 	DBUG_ENTER("ha_innopart::info_low");
3577 
3578 	/* If we are forcing recovery at a high level, we will suppress
3579 	statistics calculation on tables, because that may crash the
3580 	server if an index is badly corrupted. */
3581 
3582 	/* We do not know if MySQL can call this function before calling
3583 	external_lock(). To be safe, update the thd of the current table
3584 	handle. */
3585 
3586 	update_thd(ha_thd());
3587 
3588 	/* In case MySQL calls this in the middle of a SELECT query, release
3589 	possible adaptive hash latch to avoid deadlocks of threads. */
3590 
3591 	m_prebuilt->trx->op_info = (char*)"returning various info to MySQL";
3592 
3593 	trx_search_latch_release_if_reserved(m_prebuilt->trx);
3594 
3595 	ut_ad(m_part_share->get_table_part(0)->n_ref_count > 0);
3596 
3597 	if ((flag & HA_STATUS_TIME) != 0) {
3598 		stats.update_time = 0;
3599 
3600 		if (is_analyze) {
3601 			/* Only analyze the given partitions. */
3602 			int	error = set_altered_partitions();
3603 			if (error != 0) {
3604 				/* Already checked in mysql_admin_table! */
3605 				ut_ad(0);
3606 				DBUG_RETURN(error);
3607 			}
3608 		}
3609 		if (is_analyze || innobase_stats_on_metadata) {
3610 			m_prebuilt->trx->op_info = "updating table statistics";
3611 		}
3612 
3613 		/* TODO: Only analyze the PK for all partitions,
3614 		then the secondary indexes only for the largest partition! */
3615 		for (uint i = m_part_info->get_first_used_partition();
3616 		     i < m_tot_parts;
3617 		     i = m_part_info->get_next_used_partition(i)) {
3618 
3619 			ib_table = m_part_share->get_table_part(i);
3620 			if (is_analyze || innobase_stats_on_metadata) {
3621 				error = update_table_stats(ib_table, is_analyze);
3622 				if (error != 0) {
3623 					m_prebuilt->trx->op_info = "";
3624 					DBUG_RETURN(error);
3625 				}
3626 			}
3627 			set_if_bigger(stats.update_time,
3628 				(ulong) ib_table->update_time);
3629 		}
3630 
3631 		if (is_analyze || innobase_stats_on_metadata) {
3632 			m_prebuilt->trx->op_info =
3633 				"returning various info to MySQL";
3634 		}
3635 	}
3636 
3637 	if ((flag & HA_STATUS_VARIABLE) != 0) {
3638 
3639 		/* TODO: If this is called after pruning, then we could
3640 		also update the statistics according to the non-pruned
3641 		partitions, by allocating new rec_per_key on the TABLE,
3642 		instead of using the info from the TABLE_SHARE. */
3643 		ulint		stat_clustered_index_size = 0;
3644 		ulint		stat_sum_of_other_index_sizes = 0;
3645 		ib_uint64_t	n_rows = 0;
3646 		ulint		avail_space = 0;
3647 		bool		checked_sys_tablespace = false;
3648 
3649 		if ((flag & HA_STATUS_VARIABLE_EXTRA) != 0) {
3650 			stats.delete_length = 0;
3651 		}
3652 
3653 		for (uint i = m_part_info->get_first_used_partition();
3654 		     i < m_tot_parts;
3655 		     i = m_part_info->get_next_used_partition(i)) {
3656 
3657 			ib_table = m_part_share->get_table_part(i);
3658 			if ((flag & HA_STATUS_NO_LOCK) == 0) {
3659 				dict_table_stats_lock(ib_table, RW_S_LATCH);
3660 			}
3661 
3662 			ut_a(ib_table->stat_initialized);
3663 
3664 			n_rows += ib_table->stat_n_rows;
3665 			if (ib_table->stat_n_rows > max_rows) {
3666 				max_rows = ib_table->stat_n_rows;
3667 				biggest_partition = i;
3668 			}
3669 
3670 			stat_clustered_index_size +=
3671 				ib_table->stat_clustered_index_size;
3672 
3673 			stat_sum_of_other_index_sizes +=
3674 				ib_table->stat_sum_of_other_index_sizes;
3675 
3676 			if ((flag & HA_STATUS_NO_LOCK) == 0) {
3677 				dict_table_stats_unlock(ib_table, RW_S_LATCH);
3678 			}
3679 
3680 			if ((flag & HA_STATUS_VARIABLE_EXTRA) != 0
3681 			    && (flag & HA_STATUS_NO_LOCK) == 0
3682 			    && srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE
3683 			    && avail_space != ULINT_UNDEFINED) {
3684 
3685 				/* Only count system tablespace once! */
3686 				if (is_system_tablespace(ib_table->space)) {
3687 					if (checked_sys_tablespace) {
3688 						continue;
3689 					}
3690 					checked_sys_tablespace = true;
3691 				}
3692 
3693 				uintmax_t	space =
3694 					fsp_get_available_space_in_free_extents(
3695 						ib_table->space);
3696 				if (space == UINTMAX_MAX) {
3697 					THD*	thd = ha_thd();
3698 					const char* table_name
3699 						= ib_table->name.m_name;
3700 
3701 					push_warning_printf(
3702 						thd,
3703 						Sql_condition::SL_WARNING,
3704 						ER_CANT_GET_STAT,
3705 						"InnoDB: Trying to get the"
3706 						" free space for partition %s"
3707 						" but its tablespace has been"
3708 						" discarded or the .ibd file"
3709 						" is missing. Setting the free"
3710 						" space of the partition to"
3711 						" zero.",
3712 						ut_get_name(
3713 							m_prebuilt->trx,
3714 							table_name).c_str());
3715 				} else {
3716 					avail_space +=
3717 						static_cast<ulint>(space);
3718 				}
3719 			}
3720 		}
3721 
3722 		/*
3723 		The MySQL optimizer seems to assume in a left join that n_rows
3724 		is an accurate estimate if it is zero. Of course, it is not,
3725 		since we do not have any locks on the rows yet at this phase.
3726 		Since SHOW TABLE STATUS seems to call this function with the
3727 		HA_STATUS_TIME flag set, while the left join optimizer does not
3728 		set that flag, we add one to a zero value if the flag is not
3729 		set. That way SHOW TABLE STATUS will show the best estimate,
3730 		while the optimizer never sees the table empty. */
3731 
3732 		if (n_rows == 0 && (flag & HA_STATUS_TIME) == 0) {
3733 			n_rows++;
3734 		}
3735 
3736 		/* Fix bug#40386: Not flushing query cache after truncate.
3737 		n_rows cannot be 0 unless the table is empty; set it to 1
3738 		instead. The original problem of bug#29507 is actually
3739 		fixed in the server code. */
3740 		if (thd_sql_command(m_user_thd) == SQLCOM_TRUNCATE) {
3741 
3742 			n_rows = 1;
3743 
3744 			/* We need to reset the m_prebuilt value too, otherwise
3745 			checks for values greater than the last value written
3746 			to the table will fail and the autoinc counter will
3747 			not be updated. This will force write_row() into
3748 			attempting an update of the table's AUTOINC counter. */
3749 
3750 			m_prebuilt->autoinc_last_value = 0;
3751 		}
3752 
3753 		/* Take page_size from first partition. */
3754 		ib_table = m_part_share->get_table_part(0);
3755 		const page_size_t&	page_size =
3756 			dict_table_page_size(ib_table);
3757 
3758 		stats.records = (ha_rows) n_rows;
3759 		stats.deleted = 0;
3760 		stats.data_file_length =
3761 			((ulonglong) stat_clustered_index_size)
3762 			* page_size.physical();
3763 		stats.index_file_length =
3764 			((ulonglong) stat_sum_of_other_index_sizes)
3765 			* page_size.physical();
3766 
3767 		/* See ha_innobase::info_low() for comments! */
3768 		if ((flag & HA_STATUS_NO_LOCK) == 0
3769 		    && (flag & HA_STATUS_VARIABLE_EXTRA) != 0
3770 		    && srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) {
3771 			stats.delete_length = avail_space * 1024;
3772 		}
3773 
3774 		stats.check_time = 0;
3775 		stats.mrr_length_per_rec = ref_length + sizeof(void*)
3776 						- PARTITION_BYTES_IN_POS;
3777 
3778 		if (stats.records == 0) {
3779 			stats.mean_rec_length = 0;
3780 		} else {
3781 			stats.mean_rec_length = (ulong)
3782 				(stats.data_file_length / stats.records);
3783 		}
3784 	}
3785 
3786 	if ((flag & HA_STATUS_CONST) != 0) {
3787 		/* Find max rows and biggest partition. */
3788 		for (uint i = 0; i < m_tot_parts; i++) {
3789 			/* Skip partitions from above. */
3790 			if ((flag & HA_STATUS_VARIABLE) == 0
3791 			    || !bitmap_is_set(&(m_part_info->read_partitions),
3792 					i)) {
3793 
3794 				ib_table = m_part_share->get_table_part(i);
3795 				if (ib_table->stat_n_rows > max_rows) {
3796 					max_rows = ib_table->stat_n_rows;
3797 					biggest_partition = i;
3798 				}
3799 			}
3800 		}
3801 		ib_table = m_part_share->get_table_part(biggest_partition);
3802 		/* Verify that the numbers of indexes in InnoDB and
3803 		MySQL match up. If m_prebuilt->clust_index_was_generated
3804 		holds, InnoDB defines GEN_CLUST_INDEX internally. */
3805 		ulint	num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes)
3806 			- m_prebuilt->clust_index_was_generated;
3807 		if (table->s->keys < num_innodb_index) {
3808 			/* If there are too many indexes defined
3809 			inside InnoDB, ignore those that are being
3810 			created, because MySQL will only consider
3811 			the fully built indexes here. */
3812 
3813 			for (const dict_index_t* index =
3814 					UT_LIST_GET_FIRST(ib_table->indexes);
3815 			     index != NULL;
3816 			     index = UT_LIST_GET_NEXT(indexes, index)) {
3817 
3818 				/* First, online index creation is
3819 				completed inside InnoDB, and then
3820 				MySQL attempts to upgrade the
3821 				meta-data lock so that it can rebuild
3822 				the .frm file. If we get here in that
3823 				time frame, dict_index_is_online_ddl()
3824 				would not hold and the index would
3825 				still not be included in TABLE_SHARE. */
3826 				if (!index->is_committed()) {
3827 					num_innodb_index--;
3828 				}
3829 			}
3830 
3831 			if (table->s->keys < num_innodb_index
3832 			    && (innobase_fts_check_doc_id_index(ib_table,
3833 							NULL, NULL)
3834 				 == FTS_EXIST_DOC_ID_INDEX)) {
3835 				num_innodb_index--;
3836 			}
3837 		}
3838 
3839 		if (table->s->keys != num_innodb_index) {
3840 			ib::error() << "Table "
3841 				<< ib_table->name << " contains "
3842 				<< num_innodb_index
3843 				<< " indexes inside InnoDB, which"
3844 				" is different from the number of"
3845 				" indexes " << table->s->keys
3846 				<< " defined in MySQL";
3847 		}
3848 
3849 		if ((flag & HA_STATUS_NO_LOCK) == 0) {
3850 			dict_table_stats_lock(ib_table, RW_S_LATCH);
3851 		}
3852 
3853 		ut_a(ib_table->stat_initialized);
3854 
3855 		for (ulong i = 0; i < table->s->keys; i++) {
3856 			ulong	j;
3857 			/* We could get index quickly through internal
3858 			index mapping with the index translation table.
3859 			The identity of index (match up index name with
3860 			that of table->key_info[i]) is already verified in
3861 			innopart_get_index(). */
3862 			dict_index_t*	index = innopart_get_index(
3863 							biggest_partition, i);
3864 
3865 			if (index == NULL) {
3866 				ib::error() << "Table "
3867 					<< ib_table->name << " contains fewer"
3868 					" indexes inside InnoDB than"
3869 					" are defined in the MySQL"
3870 					" .frm file. Have you mixed up"
3871 					" .frm files from different"
3872 					" installations? "
3873 					<< TROUBLESHOOTING_MSG;
3874 				break;
3875 			}
3876 
3877 			KEY*	key = &table->key_info[i];
3878 			for (j = 0;
3879 			     j < key->actual_key_parts;
3880 			     j++) {
3881 
3882 				if ((key->flags & HA_FULLTEXT) != 0) {
3883 					/* The whole concept has no validity
3884 					for FTS indexes. */
3885 					key->rec_per_key[j] = 1;
3886 					continue;
3887 				}
3888 
3889 				if ((j + 1) > index->n_uniq) {
3890 					ib::error() << "Index " << index->name
3891 						<< " of " << ib_table->name
3892 						<< " has " << index->n_uniq
3893 						<< " columns unique inside"
3894 						" InnoDB, but MySQL is"
3895 						" asking statistics for "
3896 						<< j + 1 << " columns. Have"
3897 						" you mixed up .frm files"
3898 						" from different"
3899 						" installations? "
3900 						<< TROUBLESHOOTING_MSG;
3901 					break;
3902 				}
3903 
3904 				/* innodb_rec_per_key() will use
3905 				index->stat_n_diff_key_vals[] and the value we
3906 				pass index->table->stat_n_rows. Both are
3907 				calculated by ANALYZE and by the background
3908 				stats gathering thread (which kicks in when too
3909 				much of the table has been changed). In
3910 				addition table->stat_n_rows is adjusted with
3911 				each DML (e.g. ++ on row insert). Those
3912 				adjustments are not MVCC'ed and not even
3913 				reversed on rollback. So,
3914 				index->stat_n_diff_key_vals[] and
3915 				index->table->stat_n_rows could have been
3916 				calculated at different times. This is
3917 				acceptable. */
3918 				const rec_per_key_t	rec_per_key =
3919 					innodb_rec_per_key(
3920 						index, j,
3921 						max_rows);
3922 
3923 				key->set_records_per_key(j, rec_per_key);
3924 
3925 				/* The code below is legacy and should be
3926 				removed together with this comment once we
3927 				are sure the new floating point rec_per_key,
3928 				set via set_records_per_key(), works fine. */
3929 
3930 				ulong	rec_per_key_int = static_cast<ulong>(
3931 					innodb_rec_per_key(index, j,
3932 							   max_rows));
3933 
3934 				/* Since MySQL seems to favor table scans
3935 				too much over index searches, we pretend
3936 				index selectivity is 2 times better than
3937 				our estimate: */
3938 
3939 				rec_per_key_int = rec_per_key_int / 2;
3940 
3941 				if (rec_per_key_int == 0) {
3942 					rec_per_key_int = 1;
3943 				}
3944 
3945 				key->rec_per_key[j] = rec_per_key_int;
3946 			}
3947 		}
3948 
3949 		if ((flag & HA_STATUS_NO_LOCK) == 0) {
3950 			dict_table_stats_unlock(ib_table, RW_S_LATCH);
3951 		}
3952 
3953 		char		path[FN_REFLEN];
3954 		os_file_stat_t	stat_info;
3955 		/* Use the first partition for create time until new DD. */
3956 		ib_table = m_part_share->get_table_part(0);
3957 		my_snprintf(path, sizeof(path), "%s/%s%s",
3958 			    mysql_data_home,
3959 			    table->s->normalized_path.str,
3960 			    reg_ext);
3961 
3962 		unpack_filename(path, path);
3963 
3964 		if (os_file_get_status(path, &stat_info, false, true) == DB_SUCCESS) {
3965 			stats.create_time = (ulong) stat_info.ctime;
3966 		}
3967 	}
3968 
3969 	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
3970 
3971 		goto func_exit;
3972 	}
3973 
3974 	if ((flag & HA_STATUS_ERRKEY) != 0) {
3975 		const dict_index_t*	err_index;
3976 
3977 		ut_a(m_prebuilt->trx);
3978 		ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N);
3979 
3980 		err_index = trx_get_error_index(m_prebuilt->trx);
3981 
3982 		if (err_index != NULL) {
3983 			errkey = m_part_share->get_mysql_key(m_last_part,
3984 							err_index);
3985 		} else {
3986 			errkey = (unsigned int) (
3987 				(m_prebuilt->trx->error_key_num
3988 				 == ULINT_UNDEFINED)
3989 					? UINT_MAX
3990 					: m_prebuilt->trx->error_key_num);
3991 		}
3992 	}
3993 
3994 	if ((flag & HA_STATUS_AUTO) != 0) {
3995 		/* auto_inc is only supported in first key for InnoDB! */
3996 		ut_ad(table_share->next_number_keypart == 0);
3997 		DBUG_PRINT("info", ("HA_STATUS_AUTO"));
3998 		if (table->found_next_number_field == NULL) {
3999 			stats.auto_increment_value = 0;
4000 		} else {
4001 			/* Lock to avoid two concurrent initializations. */
4002 			lock_auto_increment();
4003 			if (m_part_share->auto_inc_initialized) {
4004 				stats.auto_increment_value =
4005 					m_part_share->next_auto_inc_val;
4006 			} else {
4007 				/* The auto-inc mutex in the table_share is
4008 				locked, so we do not need to have the handlers
4009 				locked. */
4010 
4011 				error = initialize_auto_increment(
4012 					(flag & HA_STATUS_NO_LOCK) != 0);
4013 				stats.auto_increment_value =
4014 						m_part_share->next_auto_inc_val;
4015 			}
4016 			unlock_auto_increment();
4017 		}
4018 	}
4019 
4020 func_exit:
4021 	m_prebuilt->trx->op_info = (char*)"";
4022 
4023 	DBUG_RETURN(error);
4024 }
4025 
4026 /** Optimize table.
4027 This is mapped to "ALTER TABLE tablename ENGINE=InnoDB", which rebuilds
4028 the table in MySQL.
4029 @param[in]	thd		Connection thread handle.
4030 @param[in]	check_opt	Currently ignored.
4031 @return	0 for success else error code. */
4032 int
4033 ha_innopart::optimize(
4034 	THD*		thd,
4035 	HA_CHECK_OPT*	check_opt)
4036 {
4037 	return(HA_ADMIN_TRY_ALTER);
4038 }
4039 
4040 /** Checks a partitioned table.
4041 Tries to check that an InnoDB table is not corrupted. If corruption is
4042 noticed, prints information about it to stderr. In case of corruption
4043 it may also assert a failure and crash the server. Also checks for records
4044 in the wrong partition.
4045 @param[in]	thd		MySQL THD object/thread handle.
4046 @param[in]	check_opt	Check options.
4047 @return	HA_ADMIN_CORRUPT or HA_ADMIN_OK. */
4048 int
4049 ha_innopart::check(
4050 	THD*		thd,
4051 	HA_CHECK_OPT*	check_opt)
4052 {
4053 	uint	error = HA_ADMIN_OK;
4054 	uint	i;
4055 
4056 	DBUG_ENTER("ha_innopart::check");
4057 	/* TODO: Enhance this to:
4058 	- Every partition has the same structure.
4059 	- The names are correct (partition names checked in ::open()?)
4060 	Currently it only does normal InnoDB check of each partition. */
4061 
4062 	if (set_altered_partitions()) {
4063 		ut_ad(0);   // Already checked by set_part_state()!
4064 		DBUG_RETURN(HA_ADMIN_INVALID);
4065 	}
4066 	for (i = m_part_info->get_first_used_partition();
4067 	     i < m_tot_parts;
4068 	     i = m_part_info->get_next_used_partition(i)) {
4069 
4070 		m_prebuilt->table = m_part_share->get_table_part(i);
4071 		error = ha_innobase::check(thd, check_opt);
4072 		if (error != 0) {
4073 			break;
4074 		}
4075 		if ((check_opt->flags & (T_MEDIUM | T_EXTEND)) != 0) {
4076 			error = Partition_helper::check_misplaced_rows(i, false);
4077 			if (error != 0) {
4078 				break;
4079 			}
4080 		}
4081 	}
4082 	if (error != 0) {
4083 		print_admin_msg(
4084 			thd,
4085 			256,
4086 			"error",
4087 			table_share->db.str,
4088 			table->alias,
4089 			"check",
4090 			m_is_sub_partitioned ?
4091 			  "Subpartition %s returned error"
4092 			  : "Partition %s returned error",
4093 			m_part_share->get_partition_name(i));
4094 	}
4095 
4096 	DBUG_RETURN(error);
4097 }
4098 
4099 /** Repair a partitioned table.
4100 Only repairs records in the wrong partition (moves them to the correct
4101 partition or deletes them if they do not belong to any partition).
4102 @param[in]	thd		MySQL THD object/thread handle.
4103 @param[in]	repair_opt	Repair options.
4104 @return	0 or error code. */
4105 int
4106 ha_innopart::repair(
4107 	THD*		thd,
4108 	HA_CHECK_OPT*	repair_opt)
4109 {
4110 	uint	error = HA_ADMIN_OK;
4111 
4112 	DBUG_ENTER("ha_innopart::repair");
4113 
4114 	/* TODO: enable this warning to be clear about what is repaired.
4115 	Currently disabled to generate smaller test diffs. */
4116 #ifdef ADD_WARNING_FOR_REPAIR_ONLY_PARTITION
4117 	push_warning_printf(thd, Sql_condition::SL_WARNING,
4118 			    ER_ILLEGAL_HA,
4119 			    "Only moving rows from wrong partition to correct"
4120 			    " partition is supported,"
4121 			    " repairing InnoDB indexes is not yet supported!");
4122 #endif
4123 
4124 	/* Only repair partitions for MEDIUM or EXTENDED options. */
4125 	if ((repair_opt->flags & (T_MEDIUM | T_EXTEND)) == 0) {
4126 		DBUG_RETURN(HA_ADMIN_OK);
4127 	}
4128 	if (set_altered_partitions()) {
4129 		ut_ad(0);   // Already checked by set_part_state()!
4130 		DBUG_RETURN(HA_ADMIN_INVALID);
4131 	}
4132 	for (uint i = m_part_info->get_first_used_partition();
4133 	     i < m_tot_parts;
4134 	     i = m_part_info->get_next_used_partition(i)) {
4135 
4136 		/* TODO: Implement and use ha_innobase::repair()! */
4137 		error = Partition_helper::check_misplaced_rows(i, true);
4138 		if (error != 0) {
4139 			print_admin_msg(
4140 				thd,
4141 				256,
4142 				"error",
4143 				table_share->db.str,
4144 				table->alias,
4145 				"repair",
4146 				m_is_sub_partitioned ?
4147 				  "Subpartition %s returned error"
4148 				  : "Partition %s returned error",
4149 				m_part_share->get_partition_name(i));
4150 			break;
4151 		}
4152 	}
4153 
4154 	DBUG_RETURN(error);
4155 }
4156 
4157 /** Check if possible to switch engine (no foreign keys).
4158 Checks if ALTER TABLE may change the storage engine of the table.
4159 Changing storage engines is not allowed for tables for which there
4160 are foreign key constraints (parent or child tables).
4161 @return	true if can switch engines. */
4162 bool
4163 ha_innopart::can_switch_engines()
4164 {
4165 	bool	can_switch;
4166 
4167 	DBUG_ENTER("ha_innopart::can_switch_engines");
4168 	can_switch = ha_innobase::can_switch_engines();
4169 	ut_ad(can_switch);
4170 
4171 	DBUG_RETURN(can_switch);
4172 }
4173 
4174 /** Checks if a table is referenced by a foreign key.
4175 The MySQL manual states that a REPLACE is either equivalent to an INSERT,
4176 or DELETE(s) + INSERT. Only a delete is then allowed internally to resolve
4177 a duplicate key conflict in REPLACE, not an update.
4178 @return	> 0 if referenced by a FOREIGN KEY. */
4179 uint
4180 ha_innopart::referenced_by_foreign_key()
4181 {
4182 	if (dict_table_is_referenced_by_foreign_key(m_prebuilt->table)) {
4183 
4184 #ifndef HA_INNOPART_SUPPORTS_FOREIGN_KEYS
4185 		ut_ad(0);
4186 #endif /* HA_INNOPART_SUPPORTS_FOREIGN_KEYS */
4187 		return(1);
4188 	}
4189 
4190 	return(0);
4191 }
4192 
4193 /** Start statement.
4194 MySQL calls this function at the start of each SQL statement inside LOCK
4195 TABLES. Inside LOCK TABLES the ::external_lock method does not work to
4196 mark SQL statement borders. Note also a special case: if a temporary table
4197 is created inside LOCK TABLES, MySQL has not called external_lock() at all
4198 on that table.
4199 MySQL-5.0 also calls this before each statement in an execution of a stored
4200 procedure. To make the execution more deterministic for binlogging, MySQL-5.0
4201 locks all tables involved in a stored procedure with full explicit table
4202 locks (thd_in_lock_tables(thd) holds in store_lock()) before executing the
4203 procedure.
4204 @param[in]	thd		Handle to the user thread.
4205 @param[in]	lock_type	Lock type.
4206 @return	0 or error code. */
4207 int
4208 ha_innopart::start_stmt(
4209 	THD*		thd,
4210 	thr_lock_type	lock_type)
4211 {
4212 	int	error = 0;
4213 
4214 	if (m_part_info->get_first_used_partition() == MY_BIT_NONE) {
4215 		/* All partitions pruned away, do nothing! */
4216 		return(error);
4217 	}
4218 
4219 	error = ha_innobase::start_stmt(thd, lock_type);
4220 	if (m_prebuilt->sql_stat_start) {
4221 		memset(m_sql_stat_start_parts, 0xff,
4222 		       UT_BITS_IN_BYTES(m_tot_parts));
4223 	} else {
4224 		memset(m_sql_stat_start_parts, 0,
4225 		       UT_BITS_IN_BYTES(m_tot_parts));
4226 	}
4227 	return(error);
4228 }
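/* Note on m_sql_stat_start_parts (added commentary, inferred from the
memset calls above): it is a byte array used as a one-bit-per-partition
bitmap, so "every partition must treat its next access as a statement
start" is expressed by setting all bits:

	memset(m_sql_stat_start_parts, 0xff, UT_BITS_IN_BYTES(m_tot_parts));

while clearing the array records that the per-partition sql_stat_start
flag has already been consumed for this statement. */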
4229 
4230 /** Store lock for all partitions in a native partitioned table. See also
4231 ha_innobase::store_lock() for more details.
4232 @param[in]	thd		user thread handle
4233 @param[in]	to		pointer to the current element in an array of
4234 pointers to lock structs
4235 @param[in]	lock_type	lock type to store in 'lock'; this may also be
4236 TL_IGNORE
4237 @retval	to	pointer to the current element in the 'to' array */
4238 THR_LOCK_DATA**
4239 ha_innopart::store_lock(
4240 	THD*			thd,
4241 	THR_LOCK_DATA**		to,
4242 	thr_lock_type		lock_type)
4243 {
4244 	trx_t*  trx = m_prebuilt->trx;
4245 	const uint sql_command = thd_sql_command(thd);
4246 
4247 	ha_innobase::store_lock(thd, to, lock_type);
4248 
4249 	if (sql_command == SQLCOM_FLUSH
4250 	    && lock_type == TL_READ_NO_INSERT) {
4251 		for (uint i = 1; i < m_tot_parts; i++) {
4252 			dict_table_t* table = m_part_share->get_table_part(i);
4253 
4254 			dberr_t err = row_quiesce_set_state(
4255 				table, QUIESCE_START, trx);
4256 			ut_a(err == DB_SUCCESS || err == DB_UNSUPPORTED);
4257 		}
4258 	}
4259 
4260 	return to;
4261 }
4262 
4263 /** Lock/prepare to lock table.
4264 As MySQL will execute an external lock for every new table it uses when it
4265 starts to process an SQL statement (an exception is when MySQL calls
4266 start_stmt for the handle), we can use this function to store the pointer to
4267 the THD in the handle. We will also use this function to communicate
4268 to InnoDB that a new SQL statement has started and that we must store a
4269 savepoint to our transaction handle, so that we are able to roll back
4270 the SQL statement in case of an error.
4271 @param[in]	thd		Handle to the user thread.
4272 @param[in]	lock_type	Lock type.
4273 @return	0 or error number. */
4274 int
4275 ha_innopart::external_lock(
4276 	THD*	thd,
4277 	int	lock_type)
4278 {
4279 	int	error = 0;
4280 
4281 	if (m_part_info->get_first_used_partition() == MY_BIT_NONE
4282 		&& !(m_mysql_has_locked
4283 		     && lock_type == F_UNLCK)) {
4284 
4285 		/* All partitions pruned away, do nothing! */
4286 		ut_ad(!m_mysql_has_locked);
4287 		return(error);
4288 	}
4289 	ut_ad(m_mysql_has_locked || lock_type != F_UNLCK);
4290 
4291 	m_prebuilt->table = m_part_share->get_table_part(0);
4292 	error = ha_innobase::external_lock(thd, lock_type);
4293 
4294 	for (uint i = 0; i < m_tot_parts; i++) {
4295 		dict_table_t* table = m_part_share->get_table_part(i);
4296 
4297 		switch (table->quiesce) {
4298 		case QUIESCE_START:
4299 			/* Check for FLUSH TABLE t WITH READ LOCK */
4300 			if (!srv_read_only_mode
4301 			    && thd_sql_command(thd) == SQLCOM_FLUSH
4302 			    && lock_type == F_RDLCK) {
4303 
4304 				ut_ad(table->quiesce == QUIESCE_START);
4305 
4306 				if (dict_table_is_discarded(table)) {
4307 					ib_senderrf(m_prebuilt->trx->mysql_thd,
4308 						    IB_LOG_LEVEL_ERROR,
4309 						    ER_TABLESPACE_DISCARDED,
4310 						    table->name.m_name);
4311 
4312 					return (HA_ERR_NO_SUCH_TABLE);
4313 				}
4314 
4315 				row_quiesce_table_start(table,
4316 							m_prebuilt->trx);
4317 
4318 				/* Use the transaction instance to track
4319 				UNLOCK TABLES; it can also happen
4320 				implicitly via START TRANSACTION. */
4321 
4322 				++m_prebuilt->trx->flush_tables;
4323 			}
4324 			break;
4325 
4326 		case QUIESCE_COMPLETE:
4327 			/* Check for UNLOCK TABLES; implicit or explicit
4328 			or trx interruption. */
4329 			if (m_prebuilt->trx->flush_tables > 0
4330 			    && (lock_type == F_UNLCK
4331 				|| trx_is_interrupted(m_prebuilt->trx))) {
4332 
4333 				ut_ad(table->quiesce == QUIESCE_COMPLETE);
4334 				row_quiesce_table_complete(table,
4335 							   m_prebuilt->trx);
4336 
4337 				ut_a(m_prebuilt->trx->flush_tables > 0);
4338 				--m_prebuilt->trx->flush_tables;
4339 			}
4340 			break;
4341 
4342 		case QUIESCE_NONE:
4343 			break;
4344 
4345 		default:
4346 			ut_ad(0);
4347 		}
4348 	}
4349 
4350 	ut_ad(!m_auto_increment_lock);
4351 	ut_ad(!m_auto_increment_safe_stmt_log_lock);
4352 
4353 	if (m_prebuilt->sql_stat_start) {
4354 		memset(m_sql_stat_start_parts, 0xff,
4355 		       UT_BITS_IN_BYTES(m_tot_parts));
4356 	} else {
4357 		memset(m_sql_stat_start_parts, 0,
4358 		       UT_BITS_IN_BYTES(m_tot_parts));
4359 	}
4360 	return(error);
4361 }
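/* Quiesce walk-through (added commentary, recapping the switch above):
a FLUSH TABLES statement makes store_lock() move each partition to
QUIESCE_START; the subsequent F_RDLCK external lock starts the quiesce
and counts it on the transaction, and the matching F_UNLCK (or an
interrupted transaction) completes it:

	QUIESCE_START    -> row_quiesce_table_start(table, trx),    ++trx->flush_tables
	QUIESCE_COMPLETE -> row_quiesce_table_complete(table, trx), --trx->flush_tables
*/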
4362 
4363 /** Get the current auto_increment value.
4364 @param[in]	offset			Table auto-inc offset.
4365 @param[in]	increment		Table auto-inc increment.
4366 @param[in]	nb_desired_values	Number of required values.
4367 @param[out]	first_value		The auto increment value.
4368 @param[out]	nb_reserved_values	Number of reserved values.
4369 The value is returned through first_value; it is set to ~0 on failure. */
4370 void
4371 ha_innopart::get_auto_increment(
4372 	ulonglong	offset,
4373 	ulonglong	increment,
4374 	ulonglong	nb_desired_values,
4375 	ulonglong*	first_value,
4376 	ulonglong*	nb_reserved_values)
4377 {
4378 	DBUG_ENTER("ha_innopart::get_auto_increment");
4379 	if (table_share->next_number_keypart != 0) {
4380 		/* Only first key part allowed as autoinc for InnoDB tables! */
4381 		ut_ad(0);
4382 		*first_value = ULLONG_MAX;
4383 		DBUG_VOID_RETURN;
4384 	}
4385 	get_auto_increment_first_field(
4386 		increment,
4387 		nb_desired_values,
4388 		first_value,
4389 		nb_reserved_values);
4390 	DBUG_VOID_RETURN;
4391 }
4392 
4393 /** Get partition row type.
4394 @param[in]	part_id	Id of the partition for which the row type is retrieved.
4395 @return Partition row type */
4396 enum row_type ha_innopart::get_partition_row_type(
4397         uint part_id)
4398 {
4399 	set_partition(part_id);
4400 	return get_row_type();
4401 }
4402 
4403 /** Compares two 'refs'.
4404 A 'ref' is the (internal) primary key value of the row.
4405 If there is no explicitly declared non-null unique key or a primary key, then
4406 InnoDB internally uses the row id as the primary key.
4407 The partition id is used as a secondary comparison.
4408 @param[in]	ref1	An (internal) primary key value in the MySQL key value
4409 format.
4410 @param[in]	ref2	Reference to compare with (same type as ref1).
4411 @return	< 0 if ref1 < ref2, 0 if equal, else > 0. */
4412 int
4413 ha_innopart::cmp_ref(
4414 	const uchar*	ref1,
4415 	const uchar*	ref2)
4416 {
4417 	int	cmp;
4418 
4419 	cmp = ha_innobase::cmp_ref(ref1 + PARTITION_BYTES_IN_POS,
4420 				   ref2 + PARTITION_BYTES_IN_POS);
4421 
4422 	if (cmp != 0) {
4423 		return(cmp);
4424 	}
4425 
4426 	cmp = static_cast<int>(uint2korr(ref1))
4427 		- static_cast<int>(uint2korr(ref2));
4428 
4429 	return(cmp);
4430 }
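/* Reference layout assumed by cmp_ref() above (added commentary):

	ref[0..1]              partition id, read with uint2korr()
	ref[2..ref_length-1]   clustered index key value in MySQL key format

PARTITION_BYTES_IN_POS is the 2-byte partition id prefix, so the key
part is compared first and the partition id only breaks ties when the
key parts compare equal. */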
4431 
4432 /** Prepare for creating new partitions during ALTER TABLE ... PARTITION.
4433 @param[in]	num_partitions	Number of new partitions to be created.
4434 @param[in]	only_create	True if only creating the partition
4435 (no open/lock is needed).
4436 @return	0 for success else error code. */
4437 int
4438 ha_innopart::prepare_for_new_partitions(
4439 	uint	num_partitions,
4440 	bool	only_create)
4441 {
4442 	m_new_partitions = UT_NEW(Altered_partitions(num_partitions,
4443 						     only_create),
4444 				  mem_key_partitioning);
4445 	if (m_new_partitions == NULL) {
4446 		return(HA_ERR_OUT_OF_MEM);
4447 	}
4448 	if (m_new_partitions->initialize()) {
4449 		UT_DELETE(m_new_partitions);
4450 		m_new_partitions = NULL;
4451 		return(HA_ERR_OUT_OF_MEM);
4452 	}
4453 	return(0);
4454 }
4455 
4456 /** Create a new partition to be filled during ALTER TABLE ... PARTITION.
4457 @param[in]	table		Table to create the partition in.
4458 @param[in]	create_info	Table/partition specific create info.
4459 @param[in]	part_name	Partition name.
4460 @param[in]	new_part_id	Partition id in new table.
4461 @param[in]	part_elem	Partition element.
4462 @return	0 for success else error code. */
4463 int
4464 ha_innopart::create_new_partition(
4465 	TABLE*			table,
4466 	HA_CREATE_INFO*		create_info,
4467 	const char*		part_name,
4468 	uint			new_part_id,
4469 	partition_element*	part_elem)
4470 {
4471 	int		error;
4472 	char		norm_name[FN_REFLEN];
4473 	const char*	tablespace_name_backup = create_info->tablespace;
4474 	const char*	data_file_name_backup = create_info->data_file_name;
4475 	DBUG_ENTER("ha_innopart::create_new_partition");
4476 	/* Delete by ddl_log on failure. */
4477 	normalize_table_name(norm_name, part_name);
4478 	set_create_info_dir(part_elem, create_info);
4479 
4480 	/* The below check is the same as for CREATE TABLE, but since we are
4481 	doing an alter here it will not trigger the check in
4482 	create_option_tablespace_is_valid(). */
4483 	if (tablespace_is_shared_space(create_info)
4484 	    && create_info->data_file_name != NULL
4485 	    && create_info->data_file_name[0] != '\0') {
4486 		my_printf_error(ER_ILLEGAL_HA_CREATE_OPTION,
4487 			"InnoDB: DATA DIRECTORY cannot be used"
4488 			" with a TABLESPACE assignment.", MYF(0));
4489 		DBUG_RETURN(HA_WRONG_CREATE_OPTION);
4490 	}
4491 
4492 	if (tablespace_is_shared_space(create_info)) {
4493 		push_deprecated_warn_no_replacement(
4494 			ha_thd(), PARTITION_IN_SHARED_TABLESPACE_WARNING);
4495 	}
4496 
4497 	error = ha_innobase::create(norm_name, table, create_info);
4498 	create_info->tablespace = tablespace_name_backup;
4499 	create_info->data_file_name = data_file_name_backup;
4500 	if (error == HA_ERR_FOUND_DUPP_KEY) {
4501 		DBUG_RETURN(HA_ERR_TABLE_EXIST);
4502 	}
4503 	if (error != 0) {
4504 		DBUG_RETURN(error);
4505 	}
4506 	if (!m_new_partitions->only_create())
4507 	{
4508 		dict_table_t* part;
4509 		part = dict_table_open_on_name(norm_name,
4510 					       false,
4511 					       true,
4512 					       DICT_ERR_IGNORE_NONE);
4513 		if (part == NULL) {
4514 			DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
4515 		}
4516 		m_new_partitions->set_part(new_part_id, part);
4517 	}
4518 	DBUG_RETURN(0);
4519 }
4520 
4521 /** Close and finalize new partitions. */
4522 void
4523 ha_innopart::close_new_partitions()
4524 {
4525 	if (m_new_partitions != NULL) {
4526 		UT_DELETE(m_new_partitions);
4527 		m_new_partitions = NULL;
4528 	}
4529 }
4530 
4531 /** Write a row to the new partition.
4532 @param[in]	new_part	New partition to write to.
4533 @return	0 for success else error code. */
4534 int
4535 ha_innopart::write_row_in_new_part(
4536 	uint	new_part)
4537 {
4538 	int	result;
4539 	DBUG_ENTER("ha_innopart::write_row_in_new_part");
4540 
4541 	m_last_part = new_part;
4542 	if (m_new_partitions->part(new_part) == NULL) {
4543 		/* Altered partition contains misplaced row. */
4544 		m_err_rec = table->record[0];
4545 		DBUG_RETURN(HA_ERR_ROW_IN_WRONG_PARTITION);
4546 	}
4547 	m_new_partitions->get_prebuilt(m_prebuilt, new_part);
4548 	result = ha_innobase::write_row(table->record[0]);
4549 	m_new_partitions->set_from_prebuilt(m_prebuilt, new_part);
4550 	DBUG_RETURN(result);
4551 }
4552 
4553 /** Allocate the array to hold blob heaps for all partitions */
4554 mem_heap_t**
4555 ha_innopart::alloc_blob_heap_array()
4556 {
4557 	DBUG_ENTER("ha_innopart::alloc_blob_heap_array");
4558 
4559 	const ulint	len = sizeof(mem_heap_t*) * m_tot_parts;
4560 	m_blob_heap_parts = static_cast<mem_heap_t**>(
4561 		ut_zalloc(len, mem_key_partitioning));
4562 	if (m_blob_heap_parts == NULL) {
4563 		DBUG_RETURN(NULL);
4564 	}
4565 
4566 	DBUG_RETURN(m_blob_heap_parts);
4567 }
4568 
4569 /** Free the array that holds blob heaps for all partitions */
4570 void
4571 ha_innopart::free_blob_heap_array()
4572 {
4573 	DBUG_ENTER("ha_innopart::free_blob_heap_array");
4574 
4575 	if (m_blob_heap_parts != NULL) {
4576 		clear_blob_heaps();
4577 		ut_free(m_blob_heap_parts);
4578 		m_blob_heap_parts = NULL;
4579 	}
4580 
4581 	DBUG_VOID_RETURN;
4582 }
4583 
4584 void
4585 ha_innopart::clear_blob_heaps()
4586 {
4587 	DBUG_ENTER("ha_innopart::clear_blob_heaps");
4588 
4589 	if (m_blob_heap_parts == NULL) {
4590 		DBUG_VOID_RETURN;
4591 	}
4592 
4593 	for (uint i = 0; i < m_tot_parts; i++) {
4594 		if (m_blob_heap_parts[i] != NULL) {
4595 			DBUG_PRINT("ha_innopart", ("freeing blob_heap: %p",
4596 						   m_blob_heap_parts[i]));
4597 			mem_heap_free(m_blob_heap_parts[i]);
4598 			m_blob_heap_parts[i] = NULL;
4599 		}
4600 	}
4601 
4602 	/* Reset blob_heap in m_prebuilt after freeing all heaps. It is set in
4603 	ha_innopart::set_partition to the blob heap of current partition. */
4604 	m_prebuilt->blob_heap = NULL;
4605 
4606 	DBUG_VOID_RETURN;
4607 }
4608 
4609 /** Reset state of file to after 'open'. This function is called
4610 after every statement for all tables used by that statement. */
4611 int
4612 ha_innopart::reset()
4613 {
4614 	DBUG_ENTER("ha_innopart::reset");
4615 
4616 	clear_blob_heaps();
4617 
4618 	DBUG_RETURN(ha_innobase::reset());
4619 }
4620 
4621 /**
4622  Read a row using position, with the given record to find it.
4623 
4624 This works like the position()+rnd_pos() functions, but does some
4625 extra work, calculating m_last_part - the partition to which
4626 the 'record' belongs. Only useful when the position is based
4627 on the primary key (HA_PRIMARY_KEY_REQUIRED_FOR_POSITION).
4628 
4629 @param[in]	record	Current record in MySQL Row Format.
4630 @return	0 for success else error code. */
4631 int
4632 ha_innopart::rnd_pos_by_record(uchar*  record)
4633 {
4634 	int error;
4635 	DBUG_ENTER("ha_innopart::rnd_pos_by_record");
4636 	assert(ha_table_flags() &
4637 	       HA_PRIMARY_KEY_REQUIRED_FOR_POSITION);
4638 	/* TODO: Support HA_READ_BEFORE_WRITE_REMOVAL */
4639 	/* Set m_last_part correctly. */
4640 	if (unlikely(get_part_for_delete(record,
4641 					 m_table->record[0],
4642 					 m_part_info,
4643 					 &m_last_part))) {
4644 		DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
4645 	}
4646 
4647 	/* Init only the partition in which row resides */
4648 	error = rnd_init_in_part(m_last_part, false);
4649 	if (error != 0) {
4650 		goto err;
4651 	}
4652 
4653 	position(record);
4654 	error = handler::ha_rnd_pos(record, ref);
4655 err:
4656 	rnd_end_in_part(m_last_part, FALSE);
4657 	DBUG_RETURN(error);
4658 }
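/* Call sequence recap (added commentary): only the partition owning
'record' is touched, and the generic handler random-position path is
reused for the actual read; the scan is always ended, even on error:

	get_part_for_delete(record, m_table->record[0], m_part_info, &m_last_part);
	rnd_init_in_part(m_last_part, false);
	position(record);
	handler::ha_rnd_pos(record, ref);
	rnd_end_in_part(m_last_part, false);
*/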
4659 
4660 /****************************************************************************
4661  * DS-MRR implementation
4662  ***************************************************************************/
4663 
4664 /* TODO: move the default implementations into the base handler class! */
4665 /* TODO: See if it could be optimized for partitioned tables? */
4666 /* Use default ha_innobase implementation for now... */
4667