1 /*****************************************************************************
2
3 Copyright (c) 2014, 2021, Oracle and/or its affiliates.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation. The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License, version 2.0, for more details.
20
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24
25 *****************************************************************************/
26
27 /** @file ha_innopart.cc
28 Code for native partitioning in InnoDB.
29
30 Created Nov 22, 2013 Mattias Jonsson */
31
32 #include "univ.i"
33
34 /* Include necessary SQL headers */
35 #include <debug_sync.h>
36 #include <log.h>
37 #include <strfunc.h>
38 #include <sql_acl.h>
39 #include <sql_class.h>
40 #include <sql_show.h>
41 #include <sql_table.h>
42 #include <my_check_opt.h>
43
44 /* Include necessary InnoDB headers */
45 #include "btr0sea.h"
46 #include "dict0dict.h"
47 #include "dict0stats.h"
48 #include "lock0lock.h"
49 #include "row0import.h"
50 #include "row0merge.h"
51 #include "row0mysql.h"
52 #include "row0quiesce.h"
53 #include "row0sel.h"
54 #include "row0ins.h"
55 #include "row0upd.h"
56 #include "fsp0sysspace.h"
57 #include "ut0ut.h"
58
59 #include "ha_innodb.h"
60 #include "ha_innopart.h"
61 #include "partition_info.h"
62 #include "key.h"
63 #include "dict0priv.h"
64
65 #define INSIDE_HA_INNOPART_CC
66
/* To be backwards compatible we also fold partition separator on windows. */
#ifdef _WIN32
const char* part_sep = "#p#";
const char* sub_sep = "#sp#";
#else
const char* part_sep = "#P#";
const char* sub_sep = "#SP#";
#endif /* _WIN32 */

/* Partition separator for *nix platforms; used as the canonical on-disk
form regardless of host platform (see append_sep_and_name()). */
const char* part_sep_nix = "#P#";
const char* sub_sep_nix = "#SP#";

/* Defined elsewhere (presumably ha_innodb.cc) — highest accepted file
format setting; declared here for use by this translation unit. */
extern char* innobase_file_format_max;
81
/** Construct the shared partition metadata object.
All counters/arrays are zero-initialized; the actual partition tables
are opened later via open_table_parts().
@param[in]	table_share	MySQL TABLE_SHARE this share belongs to. */
Ha_innopart_share::Ha_innopart_share(
	TABLE_SHARE*	table_share)
	:
	Partition_share(),
	m_table_parts(),
	m_index_mapping(),
	m_tot_parts(),
	m_index_count(),
	m_ref_count(),
	m_table_share(table_share)
{}
93
~Ha_innopart_share()94 Ha_innopart_share::~Ha_innopart_share()
95 {
96 ut_ad(m_ref_count == 0);
97 if (m_table_parts != NULL) {
98 ut_free(m_table_parts);
99 m_table_parts = NULL;
100 }
101 if (m_index_mapping != NULL) {
102 ut_free(m_index_mapping);
103 m_index_mapping = NULL;
104 }
105 }
106
/** Fold a partition name to lower case on Windows.
NOTE(review): an earlier comment also mentioned lower_case_table_names
== 1, but only the Windows case is implemented here; on all other
platforms this function is a no-op.
@param[in,out]	s	String to fold. */
void
Ha_innopart_share::partition_name_casedn_str(
	char*	s)
{
#ifdef _WIN32
	innobase_casedn_str(s);
#endif
}
117
/** Translate and append partition name.
Writes the separator followed by the filename-encoded partition name
into the output buffer.
@param[out]	to	String to write in filesystem charset
@param[in]	from	Name in system charset
@param[in]	sep	Separator
@param[in]	len	Max length of to buffer
@return length of written string. */
size_t
Ha_innopart_share::append_sep_and_name(
	char*		to,
	const char*	from,
	const char*	sep,
	size_t		len)
{
	size_t	ret;
	size_t	sep_len = strlen(sep);

	ut_ad(len > sep_len + strlen(from));
	ut_ad(to != NULL);
	ut_ad(from != NULL);
	ut_ad(from[0] != '\0');

	/* Separator first, then the encoded name after it. */
	memcpy(to, sep, sep_len);

	ret = tablename_to_filename(from, to + sep_len,
				    len - sep_len);

	/* Don't convert to lower case for nix style name: the #P#/#SP#
	separators are the canonical (case-preserving) on-disk form. */
	if (strcmp(sep, part_sep_nix) != 0
	    && strcmp(sep, sub_sep_nix) != 0) {

		partition_name_casedn_str(to);
	}

	return(ret + sep_len);
}
152
153 /** Copy a cached MySQL row.
154 If requested, also avoids overwriting non-read columns.
155 @param[out] buf Row in MySQL format.
156 @param[in] cached_row Which row to copy. */
157 inline
158 void
copy_cached_row(uchar * buf,const uchar * cached_row)159 ha_innopart::copy_cached_row(
160 uchar* buf,
161 const uchar* cached_row)
162 {
163 if (m_prebuilt->keep_other_fields_on_keyread) {
164 row_sel_copy_cached_fields_for_mysql(buf, cached_row,
165 m_prebuilt);
166 } else {
167 memcpy(buf, cached_row, m_rec_length);
168 }
169 }
170
/** Open one partition.
Opens the InnoDB dictionary table backing one partition and stores it
in m_table_parts[part_id]. If the InnoDB column count disagrees with
the MySQL table share, the partition is marked corrupted and closed,
but the function still reports success so that e.g. DROP TABLE or
forced recovery can proceed on it.
@param[in]	part_id	Partition id to open.
@param[in]	partition_name	Name of internal innodb table to open.
@return false on success else true. */
bool
Ha_innopart_share::open_one_table_part(
	uint		part_id,
	const char*	partition_name)
{
	char	norm_name[FN_REFLEN];

	normalize_table_name(norm_name, partition_name);
	m_table_parts[part_id] =
		ha_innobase::open_dict_table(partition_name, norm_name,
					     TRUE, DICT_ERR_IGNORE_NONE);

	if (m_table_parts[part_id] == NULL) {
		return(true);
	}

	dict_table_t *ib_table = m_table_parts[part_id];

	/* Cross-check the column counts between the MySQL share and the
	InnoDB dictionary. When the table has a hidden FTS_DOC_ID column
	(DICT_TF2_FTS_HAS_DOC_ID), InnoDB counts one column more than
	MySQL, hence the "- 1" in the second comparison. */
	if ((!DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID)
	     && m_table_share->fields
		 != (dict_table_get_n_user_cols(ib_table)
		     + dict_table_get_n_v_cols(ib_table)))
	    || (DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID)
		&& (m_table_share->fields
		    != dict_table_get_n_user_cols(ib_table)
		       + dict_table_get_n_v_cols(ib_table) - 1))) {
		ib::warn() << "Partition `" << get_partition_name(part_id)
			<< "` contains " << dict_table_get_n_user_cols(ib_table)
			<< " user defined columns in InnoDB, but "
			<< m_table_share->fields
			<< " columns in MySQL. Please check"
			" INFORMATION_SCHEMA.INNODB_SYS_COLUMNS and " REFMAN
			"innodb-troubleshooting.html for how to resolve the"
			" issue.";

		/* Mark this partition as corrupted, so the drop table
		or force recovery can still use it, but not others.
		TODO: persist table->corrupted so it will be retained on
		restart and out-of-bounds operations will see it. */

		ib_table->corrupted = true;
		dict_table_close(ib_table, FALSE, FALSE);
	}

	/* TODO: To save memory, compare with first partition and reuse
	the column names etc. in the internal InnoDB meta-data cache. */

	return(false);
}
223
/** Set up the virtual column template for partition table, and points
all m_table_parts[]->vc_templ to it.
Caller must hold dict_sys->mutex (asserted below).
@param[in]	table	MySQL TABLE object
@param[in]	ib_table	InnoDB dict_table_t
@param[in]	name	Table name (db/table_name) */
void
Ha_innopart_share::set_v_templ(
	TABLE*		table,
	dict_table_t*	ib_table,
	const char*	name)
{
	ut_ad(mutex_own(&dict_sys->mutex));

	if (ib_table->n_v_cols > 0) {
		for (ulint i = 0; i < m_tot_parts; i++) {
			if (m_table_parts[i]->vc_templ == NULL) {
				/* First use: allocate an empty template. */
				m_table_parts[i]->vc_templ
					= UT_NEW_NOKEY(dict_vcol_templ_t());
				m_table_parts[i]->vc_templ->vtempl = NULL;
			} else if (m_table_parts[i]->get_ref_count() == 1) {
				/* Clean and refresh the template */
				dict_free_vc_templ(m_table_parts[i]->vc_templ);
				m_table_parts[i]->vc_templ->vtempl = NULL;
			}

			/* (Re)build only if the template is empty — an
			existing template shared with another handler
			instance is left untouched. */
			if (m_table_parts[i]->vc_templ->vtempl == NULL) {
				innobase_build_v_templ(
					table, ib_table,
					m_table_parts[i]->vc_templ,
					NULL, true, name);
			}
		}
	}
}
258
/** Initialize the share with table and indexes per partition.
On the first open this builds m_table_parts[] (one dict_table_t per
partition, including subpartitions) and the MySQL-key-number to
InnoDB-index translation table m_index_mapping[]. Subsequent opens
only bump reference counts.
Caller must hold the share's LOCK_ha_data (asserted in debug builds).
@param[in]	part_info	Partition info (partition names to use).
@param[in]	table_name	Table name (db/table_name).
@return false on success else true. */
bool
Ha_innopart_share::open_table_parts(
	partition_info*	part_info,
	const char*	table_name)
{
	size_t	table_name_len;
	size_t	len;
	uint	ib_num_index;
	uint	mysql_num_index;
	char	partition_name[FN_REFLEN];
	bool	index_loaded = true;

#ifndef NDEBUG
	if (m_table_share->tmp_table == NO_TMP_TABLE) {
		mysql_mutex_assert_owner(&m_table_share->LOCK_ha_data);
	}
#endif /* NDEBUG */
	m_ref_count++;

	/* Already opened by another handler instance: just add one
	reference per partition table and we are done. */
	if (m_table_parts != NULL) {
		ut_ad(m_ref_count > 1);
		ut_ad(m_tot_parts > 0);

		/* Increment dict_table_t reference count for all partitions */
		mutex_enter(&dict_sys->mutex);
		for (uint i = 0; i < m_tot_parts; i++) {
			dict_table_t*	table = m_table_parts[i];
			table->acquire();
			ut_ad(table->get_ref_count() >= m_ref_count);
		}
		mutex_exit(&dict_sys->mutex);

		return(false);
	}
	ut_ad(m_ref_count == 1);
	m_tot_parts = part_info->get_tot_partitions();
	size_t	table_parts_size = sizeof(dict_table_t*) * m_tot_parts;
	m_table_parts = static_cast<dict_table_t**>(
		ut_zalloc(table_parts_size, mem_key_partitioning));
	if (m_table_parts == NULL) {
		m_ref_count--;
		return(true);
	}

	/* Set up the array over all table partitions. Partition names
	are built as <table>#P#<part>[#SP#<subpart>] in partition_name. */
	table_name_len = strlen(table_name);
	memcpy(partition_name, table_name, table_name_len);
	List_iterator<partition_element>
				part_it(part_info->partitions);
	partition_element*	part_elem;
	uint			i = 0;

	while ((part_elem = part_it++)) {
		len = append_sep_and_name(
				partition_name + table_name_len,
				part_elem->partition_name,
				part_sep_nix,
				FN_REFLEN - table_name_len);
		if (part_info->is_sub_partitioned()) {
			List_iterator<partition_element>
				sub_it(part_elem->subpartitions);
			partition_element*	sub_elem;
			while ((sub_elem = sub_it++)) {
				append_sep_and_name(
					partition_name
					+ table_name_len + len,
					sub_elem->partition_name,
					sub_sep_nix,
					FN_REFLEN - table_name_len - len);
				if (open_one_table_part(i, partition_name)) {
					goto err;
				}
				i++;
			}
		} else {
			if (open_one_table_part(i, partition_name)) {
				goto err;
			}
			i++;
		}
	}
	ut_ad(i == m_tot_parts);

	/* Create the mapping of mysql index number to innodb indexes. */

	ib_num_index = (uint) UT_LIST_GET_LEN(m_table_parts[0]->indexes);
	mysql_num_index = part_info->table->s->keys;

	/* If there exists inconsistency between MySQL and InnoDB dictionary
	(metadata) information, the number of index defined in MySQL
	could exceed that in InnoDB, do not build index translation
	table in such case. */

	if (ib_num_index < mysql_num_index) {
		ut_ad(0);
		goto err;
	}

	if (mysql_num_index != 0) {
		size_t	alloc_size = mysql_num_index * m_tot_parts
			* sizeof(*m_index_mapping);
		m_index_mapping = static_cast<dict_index_t**>(
			ut_zalloc(alloc_size, mem_key_partitioning));
		if (m_index_mapping == NULL) {

			/* Report an error if index_mapping continues to be
			NULL and mysql_num_index is a non-zero value. */

			ib::error() << "Failed to allocate memory for"
				" index translation table. Number of"
				" Index:" << mysql_num_index;
			goto err;
		}
	}

	/* For each index in the mysql key_info array, fetch its
	corresponding InnoDB index pointer into index_mapping
	array. */

	for (ulint idx = 0; idx < mysql_num_index; idx++) {
		for (ulint part = 0; part < m_tot_parts; part++) {
			ulint	count = part * mysql_num_index + idx;

			/* Fetch index pointers into index_mapping according
			to mysql index sequence. */

			m_index_mapping[count] = dict_table_get_index_on_name(
				m_table_parts[part],
				part_info->table->key_info[idx].name);

			if (m_index_mapping[count] == NULL) {
				ib::error() << "Cannot find index `"
					<< part_info->table->key_info[idx].name
					<< "` in InnoDB index dictionary"
					" partition `"
					<< get_partition_name(part) << "`.";
				index_loaded = false;
				break;
			}

			/* Double check fetched index has the same
			column info as those in mysql key_info. */

			if (!innobase_match_index_columns(
					&part_info->table->key_info[idx],
					m_index_mapping[count])) {
				ib::error() << "Found index `"
					<< part_info->table->key_info[idx].name
					<< "` whose column info does not match"
					" that of MySQL.";
				index_loaded = false;
				break;
			}
		}
	}

	/* NOTE(review): on a translation-table build failure we free the
	mapping but still fall through to return success with
	m_index_count set; get_index() then serves only the MAX_KEY
	(clustered index) case and returns NULL otherwise. */
	if (!index_loaded && m_index_mapping != NULL) {
		ut_free(m_index_mapping);
		m_index_mapping = NULL;
	}

	/* Successfully built the translation table. */
	m_index_count = mysql_num_index;

	return(false);
err:
	/* close_table_parts() also undoes the m_ref_count++ above and
	closes whatever partitions were opened so far. */
	close_table_parts();

	return(true);
}
431
/** Close all partitions.
Decrements the share reference count; only the last closer actually
closes the partition tables and frees the arrays. Caller must hold
the share's LOCK_ha_data (asserted in debug builds). */
void
Ha_innopart_share::close_table_parts()
{
#ifndef NDEBUG
	if (m_table_share->tmp_table == NO_TMP_TABLE) {
		mysql_mutex_assert_owner(&m_table_share->LOCK_ha_data);
	}
#endif /* NDEBUG */
	m_ref_count--;
	if (m_ref_count != 0) {

		/* Decrement dict_table_t reference count for all partitions */
		mutex_enter(&dict_sys->mutex);
		for (uint i = 0; i < m_tot_parts; i++) {
			dict_table_t*	table = m_table_parts[i];
			table->release();
			ut_ad(table->get_ref_count() >= m_ref_count);
		}
		mutex_exit(&dict_sys->mutex);

		return;
	}

	/* Last instance closed, close all table partitions and
	free the memory. */

	mutex_enter(&dict_sys->mutex);
	if (m_table_parts != NULL) {
		for (uint i = 0; i < m_tot_parts; i++) {
			/* Entries can be NULL when open_table_parts()
			failed part-way through. */
			if (m_table_parts[i] != NULL) {
				dict_table_close(m_table_parts[i], TRUE, TRUE);
			}
		}
		ut_free(m_table_parts);
		m_table_parts = NULL;
	}
	mutex_exit(&dict_sys->mutex);
	if (m_index_mapping != NULL) {
		ut_free(m_index_mapping);
		m_index_mapping = NULL;
	}

	m_tot_parts = 0;
	m_index_count = 0;
}
478
479 /** Get index.
480 Find the index of the specified partition and key number.
481 @param[in] part_id Partition number.
482 @param[in] keynr Key number.
483 @return Index pointer or NULL. */
484 inline
485 dict_index_t*
get_index(uint part_id,uint keynr)486 Ha_innopart_share::get_index(
487 uint part_id,
488 uint keynr)
489 {
490 ut_a(part_id < m_tot_parts);
491 ut_ad(keynr < m_index_count || keynr == MAX_KEY);
492 if (m_index_mapping == NULL
493 || keynr >= m_index_count) {
494
495 if (keynr == MAX_KEY) {
496 return(dict_table_get_first_index(
497 get_table_part(part_id)));
498 }
499 return(NULL);
500 }
501 return(m_index_mapping[m_index_count * part_id + keynr]);
502 }
503
504 /** Get MySQL key number corresponding to InnoDB index.
505 Calculates the key number used inside MySQL for an Innobase index. We will
506 first check the "index translation table" for a match of the index to get
507 the index number. If there does not exist an "index translation table",
508 or not able to find the index in the translation table, then we will fall back
509 to the traditional way of looping through dict_index_t list to find a
510 match. In this case, we have to take into account if we generated a
511 default clustered index for the table
512 @param[in] part_id Partition the index belongs to.
513 @param[in] index Index to return MySQL key number for.
514 @return the key number used inside MySQL or UINT_MAX if key is not found. */
515 inline
516 uint
get_mysql_key(uint part_id,const dict_index_t * index)517 Ha_innopart_share::get_mysql_key(
518 uint part_id,
519 const dict_index_t* index)
520 {
521 ut_ad(index != NULL);
522 ut_ad(m_index_mapping != NULL);
523 ut_ad(m_tot_parts);
524
525 if (index != NULL && m_index_mapping != NULL) {
526 uint start;
527 uint end;
528
529 if (part_id < m_tot_parts) {
530 start = part_id * m_index_count;
531 end = start + m_index_count;
532 } else {
533 start = 0;
534 end = m_tot_parts * m_index_count;
535 }
536 for (uint i = start; i < end; i++) {
537 if (m_index_mapping[i] == index) {
538 return(i % m_index_count);
539 }
540 }
541
542 /* Print an error message if we cannot find the index
543 in the "index translation table". */
544
545 if (index->is_committed()) {
546 ib::error() << "Cannot find index "
547 << index->name
548 << " in InnoDB index translation table.";
549 }
550 }
551
552 return(UINT_MAX);
553 }
554
555 /** Helper function for set bit in bitmap.
556 @param[in,out] buf Bitmap buffer to update bit in.
557 @param[in] bit_pos Bit number (index starts at 0). */
558 static
559 inline
560 void
set_bit(byte * buf,size_t pos)561 set_bit(
562 byte* buf,
563 size_t pos)
564 {
565 buf[pos/8] |= (0x1 << (pos & 0x7));
566 }
567
568 /** Helper function for clear bit in bitmap.
569 @param[in,out] buf Bitmap buffer to update bit in.
570 @param[in] bit_pos Bit number (index starts at 0). */
571 static
572 inline
573 void
clear_bit(byte * buf,size_t pos)574 clear_bit(
575 byte* buf,
576 size_t pos)
577 {
578 buf[pos/8] &= ~(0x1 << (pos & 0x7));
579 }
580
581 /** Helper function for get bit in bitmap.
582 @param[in,out] buf Bitmap buffer.
583 @param[in] bit_pos Bit number (index starts at 0).
584 @return byte set to 0x0 or 0x1.
585 @retval 0x0 bit not set.
586 @retval 0x1 bet set. */
587 static
588 inline
589 byte
get_bit(byte * buf,size_t pos)590 get_bit(
591 byte* buf,
592 size_t pos)
593 {
594 return((buf[pos/8] >> (pos & 0x7)) & 0x1);
595 }
596
/** Helper class for encapsulating new/altered partitions during
ADD/REORG/... PARTITION. Owns the new partitions' dict_table_t
pointers and caches per-partition prebuilt state (insert node,
trx id, sql_stat_start bit) while the ALTER is in progress. */
class Altered_partitions
{
private:
	/** New partitions during ADD/REORG/... PARTITION. */
	dict_table_t**	m_new_table_parts;

	/** Insert nodes per partition. */
	ins_node_t**	m_ins_nodes;

	/** sql_stat_start per partition (bitmap, one bit each). */
	byte*		m_sql_stat_start;

	/** Trx id per partition. */
	trx_id_t*	m_trx_ids;

	/** Number of new partitions. */
	size_t		m_num_new_parts;

	/** Only need to create the partitions (no open/lock). */
	bool		m_only_create;

public:
	Altered_partitions(
		uint n_partitions,
		bool only_create);

	~Altered_partitions();

	/** Allocate the per-partition arrays.
	@return false on success else true. */
	bool
	initialize();

	/** @return true when only creation was requested (no open/lock). */
	bool
	only_create() const
	{
		return(m_only_create);
	}

	/** Set currently used partition.
	@param[in]	new_part_id	Partition id to set.
	@param[in]	part	InnoDB table to use. */
	inline
	void
	set_part(
		ulint		new_part_id,
		dict_table_t*	part)
	{
		ut_ad(m_new_table_parts[new_part_id] == NULL);
		m_new_table_parts[new_part_id] = part;
		/* Mark sql_stat_start for the newly registered partition. */
		set_bit(m_sql_stat_start, new_part_id);
	}

	/** Get lower level InnoDB table for partition.
	@param[in]	part_id	Partition id.
	@return Lower level InnoDB table for the partition id. */
	inline
	dict_table_t*
	part(
		uint	part_id) const
	{
		ut_ad(part_id < m_num_new_parts);
		return(m_new_table_parts[part_id]);
	}

	/** Set up prebuilt for using a specified partition.
	@param[in,out]	prebuilt	Prebuilt to update.
	@param[in]	new_part_id	Partition to use. */
	inline
	void
	get_prebuilt(
		row_prebuilt_t*	prebuilt,
		uint		new_part_id) const
	{
		ut_ad(m_new_table_parts[new_part_id]);
		prebuilt->table = m_new_table_parts[new_part_id];
		prebuilt->ins_node = m_ins_nodes[new_part_id];
		prebuilt->trx_id = m_trx_ids[new_part_id];
		prebuilt->sql_stat_start = get_bit(m_sql_stat_start,
						new_part_id);
	}

	/** Update cached values for a partition from prebuilt.
	@param[in]	prebuilt	Prebuilt to copy from.
	@param[in]	new_part_id	Partition id to copy. */
	inline
	void
	set_from_prebuilt(
		row_prebuilt_t*	prebuilt,
		uint		new_part_id)
	{
		ut_ad(m_new_table_parts[new_part_id] == prebuilt->table);
		m_ins_nodes[new_part_id] = prebuilt->ins_node;
		m_trx_ids[new_part_id] = prebuilt->trx_id;
		if (prebuilt->sql_stat_start == 0) {
			clear_bit(m_sql_stat_start, new_part_id);
		}
	}
};
696
/** Construct the helper; arrays stay NULL until initialize().
@param[in]	n_partitions	Number of new partitions.
@param[in]	only_create	true if only create (no open/lock) needed. */
Altered_partitions::Altered_partitions(
	uint	n_partitions,
	bool	only_create)
	:
	m_new_table_parts(),
	m_ins_nodes(),
	m_sql_stat_start(),
	m_trx_ids(),
	m_num_new_parts(n_partitions),
	m_only_create(only_create)
{}
708
/** Destructor: close any still-owned partition tables, tear down
cached insert nodes, and free all per-partition arrays. */
Altered_partitions::~Altered_partitions()
{
	if (m_new_table_parts != NULL) {
		for (ulint i = 0; i < m_num_new_parts; i++) {
			if (m_new_table_parts[i] != NULL) {
				dict_table_close(m_new_table_parts[i],
						false, true);
			}
		}
		ut_free(m_new_table_parts);
		m_new_table_parts = NULL;
	}
	if (m_ins_nodes != NULL) {
		for (ulint i = 0; i < m_num_new_parts; i++) {
			if (m_ins_nodes[i] != NULL) {
				ins_node_t*	ins = m_ins_nodes[i];
				/* NOTE(review): debug builds assert select
				is already NULL, then the free call is a
				no-op; in release builds whatever graph is
				attached would be freed here. Confirm this
				asymmetry is intended. */
				ut_ad(ins->select == NULL);
				que_graph_free_recursive(ins->select);
				ins->select = NULL;
				if (ins->entry_sys_heap != NULL) {
					mem_heap_free(ins->entry_sys_heap);
					ins->entry_sys_heap = NULL;
				}
			}
		}
		ut_free(m_ins_nodes);
		m_ins_nodes = NULL;
	}
	if (m_sql_stat_start != NULL) {
		ut_free(m_sql_stat_start);
		m_sql_stat_start = NULL;
	}
	if (m_trx_ids != NULL) {
		ut_free(m_trx_ids);
		m_trx_ids = NULL;
	}
}
746
747 /** Initialize the object.
748 @return false on success else true. */
749 bool
initialize()750 Altered_partitions::initialize()
751 {
752 size_t alloc_size = sizeof(*m_new_table_parts) * m_num_new_parts;
753 m_new_table_parts = static_cast<dict_table_t**>(
754 ut_zalloc(alloc_size, mem_key_partitioning));
755 if (m_new_table_parts == NULL) {
756 return(true);
757 }
758
759 alloc_size = sizeof(*m_ins_nodes) * m_num_new_parts;
760 m_ins_nodes = static_cast<ins_node_t**>(
761 ut_zalloc(alloc_size, mem_key_partitioning));
762 if (m_ins_nodes == NULL) {
763 ut_free(m_new_table_parts);
764 m_new_table_parts = NULL;
765 return(true);
766 }
767
768 alloc_size = sizeof(*m_sql_stat_start)
769 * UT_BITS_IN_BYTES(m_num_new_parts);
770 m_sql_stat_start = static_cast<byte*>(
771 ut_zalloc(alloc_size, mem_key_partitioning));
772 if (m_sql_stat_start == NULL) {
773 ut_free(m_new_table_parts);
774 m_new_table_parts = NULL;
775 ut_free(m_ins_nodes);
776 m_ins_nodes = NULL;
777 return(true);
778 }
779
780 alloc_size = sizeof(*m_trx_ids) * m_num_new_parts;
781 m_trx_ids = static_cast<trx_id_t*>(
782 ut_zalloc(alloc_size, mem_key_partitioning));
783 if (m_trx_ids == NULL) {
784 ut_free(m_new_table_parts);
785 m_new_table_parts = NULL;
786 ut_free(m_ins_nodes);
787 m_ins_nodes = NULL;
788 ut_free(m_sql_stat_start);
789 m_sql_stat_start = NULL;
790 return(true);
791 }
792
793 return(false);
794 }
795
/** Construct ha_innopart handler.
All per-partition state arrays start zeroed; they are set up during
::open().
@param[in]	hton	Handlerton.
@param[in]	table_arg	MySQL Table.
@return a new ha_innopart handler. */
ha_innopart::ha_innopart(
	handlerton*	hton,
	TABLE_SHARE*	table_arg)
	:
	ha_innobase(hton, table_arg),
	Partition_helper(this),
	m_ins_node_parts(),
	m_upd_node_parts(),
	m_blob_heap_parts(),
	m_trx_id_parts(),
	m_row_read_type_parts(),
	m_sql_stat_start_parts(),
	m_pcur(),
	m_clust_pcur(),
	m_new_partitions()
{
	m_int_table_flags &= ~(HA_INNOPART_DISABLED_TABLE_FLAGS);

	/* INNOBASE_SHARE is not used in ha_innopart.
	This also flags for ha_innobase that it is a partitioned table.
	And make it impossible to use legacy share functionality. */

	m_share = NULL;
}
824
/** Destruct ha_innopart handler.
All resources are released in ::close(); nothing to do here. */
ha_innopart::~ha_innopart()
{}
828
/** Returned supported alter table flags.
NOTE: the requested flags argument is ignored; the same fixed set is
always returned.
@param[in]	flags	Flags to support (unused).
@return Supported flags. */
uint
ha_innopart::alter_table_flags(
	uint	flags)
{
	return(HA_PARTITION_FUNCTION_SUPPORTED | HA_FAST_CHANGE_PARTITION);
}
838
/** Internally called for initializing auto increment value.
Only called from ha_innobase::discard_or_import_table_space()
and should not do anything, since it is ha_innopart will initialize
it on first usage.
@return 0 always (debug builds assert it is never reached). */
int
ha_innopart::innobase_initialize_autoinc()
{
	/* Should never be called for a partitioned table. */
	ut_ad(0);
	return(0);
}
849
850 /** Set the autoinc column max value.
851 This should only be called once from ha_innobase::open().
852 Therefore there's no need for a covering lock.
853 @param[in] no_lock Ignored!
854 @return 0 for success or error code. */
855 inline
856 int
initialize_auto_increment(bool)857 ha_innopart::initialize_auto_increment(
858 bool /* no_lock */)
859 {
860 int error = 0;
861 ulonglong auto_inc = 0;
862 const Field* field = table->found_next_number_field;
863
864 #ifndef NDEBUG
865 if (table_share->tmp_table == NO_TMP_TABLE)
866 {
867 mysql_mutex_assert_owner(m_part_share->auto_inc_mutex);
868 }
869 #endif
870
871 /* Since a table can already be "open" in InnoDB's internal
872 data dictionary, we only init the autoinc counter once, the
873 first time the table is loaded. We can safely reuse the
874 autoinc value from a previous MySQL open. */
875
876 if (m_part_share->auto_inc_initialized) {
877 /* Already initialized, nothing to do. */
878 return(0);
879 }
880
881 if (field == NULL) {
882 ib::info() << "Unable to determine the AUTOINC column name";
883 }
884
885 if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
886 /* If the recovery level is set so high that writes
887 are disabled we force the AUTOINC counter to 0
888 value effectively disabling writes to the table.
889 Secondly, we avoid reading the table in case the read
890 results in failure due to a corrupted table/index.
891
892 We will not return an error to the client, so that the
893 tables can be dumped with minimal hassle. If an error
894 were returned in this case, the first attempt to read
895 the table would fail and subsequent SELECTs would succeed. */
896
897 } else if (field == NULL) {
898 /* This is a far more serious error, best to avoid
899 opening the table and return failure. */
900
901 my_error(ER_AUTOINC_READ_FAILED, MYF(0));
902 error = HA_ERR_AUTOINC_READ_FAILED;
903 } else {
904 dict_index_t* index;
905 const char* col_name;
906 ib_uint64_t read_auto_inc;
907 ib_uint64_t max_auto_inc = 0;
908 ulint err;
909 dict_table_t* ib_table;
910 ulonglong col_max_value;
911
912 col_max_value = field->get_max_int_value();
913
914 update_thd(ha_thd());
915
916 col_name = field->field_name;
917 for (uint part = 0; part < m_tot_parts; part++) {
918 ib_table = m_part_share->get_table_part(part);
919 dict_table_autoinc_lock(ib_table);
920 read_auto_inc = dict_table_autoinc_read(ib_table);
921 if (read_auto_inc != 0) {
922 set_if_bigger(max_auto_inc, read_auto_inc);
923 dict_table_autoinc_unlock(ib_table);
924 continue;
925 }
926 /* Execute SELECT MAX(col_name) FROM TABLE; */
927 index = m_part_share->get_index(
928 part, table->s->next_number_index);
929 err = row_search_max_autoinc(
930 index, col_name, &read_auto_inc);
931
932 switch (err) {
933 case DB_SUCCESS: {
934 /* At the this stage we do not know the
935 increment nor the offset,
936 so use a default increment of 1. */
937
938 auto_inc = innobase_next_autoinc(
939 read_auto_inc, 1, 1, 0, col_max_value);
940 set_if_bigger(max_auto_inc, auto_inc);
941 dict_table_autoinc_initialize(ib_table,
942 auto_inc);
943 break;
944 }
945 case DB_RECORD_NOT_FOUND:
946 ib::error() << "MySQL and InnoDB data"
947 " dictionaries are out of sync. Unable"
948 " to find the AUTOINC column "
949 << col_name << " in the InnoDB table "
950 << index->table->name << ". We set the"
951 " next AUTOINC column value to 0, in"
952 " effect disabling the AUTOINC next"
953 " value generation.";
954
955 ib::info() << "You can either set the next"
956 " AUTOINC value explicitly using ALTER"
957 " TABLE or fix the data dictionary by"
958 " recreating the table.";
959
960 /* We want the open to succeed, so that the
961 user can take corrective action. ie. reads
962 should succeed but updates should fail. */
963
964 /* This will disable the AUTOINC generation. */
965 auto_inc = 0;
966 goto done;
967 default:
968 /* row_search_max_autoinc() should only return
969 one of DB_SUCCESS or DB_RECORD_NOT_FOUND. */
970
971 ut_error;
972 }
973 dict_table_autoinc_unlock(ib_table);
974 }
975 auto_inc = max_auto_inc;
976 }
977
978 done:
979 m_part_share->next_auto_inc_val = auto_inc;
980 m_part_share->auto_inc_initialized = true;
981 return(error);
982 }
983
984 /** Opens a partitioned InnoDB table.
985 Initializes needed data and opens the table which already exists
986 in an InnoDB database.
987 @param[in] name Table name (db/tablename)
988 @param[in] mode Not used
989 @param[in] test_if_locked Not used
990 @return 0 or error number. */
991 int
open(const char * name,int,uint)992 ha_innopart::open(
993 const char* name,
994 int /*mode*/,
995 uint /*test_if_locked*/)
996 {
997 dict_table_t* ib_table;
998 char norm_name[FN_REFLEN];
999 THD* thd;
1000
1001 DBUG_ENTER("ha_innopart::open");
1002
1003 ut_ad(table);
1004 if (m_part_info == NULL) {
1005 /* Must be during ::clone()! */
1006 ut_ad(table->part_info != NULL);
1007 m_part_info = table->part_info;
1008 }
1009 thd = ha_thd();
1010
1011 /* Under some cases MySQL seems to call this function while
1012 holding search latch(es). This breaks the latching order as
1013 we acquire dict_sys->mutex below and leads to a deadlock. */
1014
1015 if (thd != NULL) {
1016 innobase_release_temporary_latches(ht, thd);
1017 }
1018
1019 normalize_table_name(norm_name, name);
1020
1021 m_user_thd = NULL;
1022
1023 /* Get the Ha_innopart_share from the TABLE_SHARE. */
1024 lock_shared_ha_data();
1025 m_part_share = static_cast<Ha_innopart_share*>(get_ha_share_ptr());
1026 if (m_part_share == NULL) {
1027 m_part_share = new (std::nothrow)
1028 Ha_innopart_share(table_share);
1029 if (m_part_share == NULL) {
1030 share_error:
1031 unlock_shared_ha_data();
1032 DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
1033 }
1034 set_ha_share_ptr(static_cast<Handler_share*>(m_part_share));
1035 }
1036 if (m_part_share->open_table_parts(m_part_info, name)
1037 || m_part_share->populate_partition_name_hash(m_part_info)) {
1038 goto share_error;
1039 }
1040 if (m_part_share->auto_inc_mutex == NULL
1041 && table->found_next_number_field != NULL) {
1042 if (m_part_share->init_auto_inc_mutex(table_share)) {
1043 goto share_error;
1044 }
1045 }
1046 unlock_shared_ha_data();
1047
1048 /* Will be allocated if it is needed in ::update_row(). */
1049 m_upd_buf = NULL;
1050 m_upd_buf_size = 0;
1051
1052 /* Get pointer to a table object in InnoDB dictionary cache. */
1053 ib_table = m_part_share->get_table_part(0);
1054
1055 m_pcur_parts = NULL;
1056 m_clust_pcur_parts = NULL;
1057 m_pcur_map = NULL;
1058
1059 /* TODO: Handle mismatching #P# vs #p# in upgrading to new DD instead!
1060 See bug#58406, The problem exists when moving partitioned tables
1061 between Windows and Unix-like platforms. InnoDB always folds the name
1062 on windows, partitioning never folds partition (and #P# separator).
1063 I.e. non of it follows lower_case_table_names correctly :( */
1064
1065 if (open_partitioning(m_part_share))
1066 {
1067 close();
1068 DBUG_RETURN(HA_ERR_INITIALIZATION);
1069 }
1070
1071 /* Currently we track statistics for all partitions, but for
1072 the secondary indexes we only use the biggest partition. */
1073
1074 for (uint part_id = 0; part_id < m_tot_parts; part_id++) {
1075 innobase_copy_frm_flags_from_table_share(
1076 m_part_share->get_table_part(part_id),
1077 table->s);
1078 dict_stats_init(m_part_share->get_table_part(part_id));
1079 }
1080
1081 MONITOR_INC(MONITOR_TABLE_OPEN);
1082
1083 bool no_tablespace;
1084
1085 /* TODO: Should we do this check for every partition during ::open()? */
1086 /* TODO: refactor this in ha_innobase so it can increase code reuse. */
1087 if (dict_table_is_discarded(ib_table)) {
1088
1089 ib_senderrf(thd,
1090 IB_LOG_LEVEL_WARN, ER_TABLESPACE_DISCARDED,
1091 table->s->table_name.str);
1092
1093 /* Allow an open because a proper DISCARD should have set
1094 all the flags and index root page numbers to FIL_NULL that
1095 should prevent any DML from running but it should allow DDL
1096 operations. */
1097
1098 no_tablespace = false;
1099
1100 } else if (ib_table->file_unreadable) {
1101
1102 ib_senderrf(
1103 thd, IB_LOG_LEVEL_WARN,
1104 ER_TABLESPACE_MISSING, norm_name);
1105
1106 /* This means we have no idea what happened to the tablespace
1107 file, best to play it safe. */
1108
1109 no_tablespace = true;
1110 } else {
1111 no_tablespace = false;
1112 }
1113
1114 if (!thd_tablespace_op(thd) && no_tablespace) {
1115 set_my_errno(ENOENT);
1116 close();
1117 DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
1118 }
1119
1120 m_prebuilt = row_create_prebuilt(ib_table, table->s->reclength);
1121
1122 m_prebuilt->default_rec = table->s->default_values;
1123 ut_ad(m_prebuilt->default_rec);
1124
1125 assert(table != NULL);
1126 m_prebuilt->m_mysql_table = table;
1127 m_prebuilt->m_mysql_handler = this;
1128
1129 if (ib_table->n_v_cols > 0) {
1130 mutex_enter(&dict_sys->mutex);
1131 m_part_share->set_v_templ(table, ib_table, name);
1132 mutex_exit(&dict_sys->mutex);
1133 }
1134
1135 /* Looks like MySQL-3.23 sometimes has primary key number != 0. */
1136 m_primary_key = table->s->primary_key;
1137 key_used_on_scan = m_primary_key;
1138
1139 /* Allocate a buffer for a 'row reference'. A row reference is
1140 a string of bytes of length ref_length which uniquely specifies
1141 a row in our table. Note that MySQL may also compare two row
1142 references for equality by doing a simple memcmp on the strings
1143 of length ref_length! */
1144
1145 if (!row_table_got_default_clust_index(ib_table)) {
1146
1147 m_prebuilt->clust_index_was_generated = FALSE;
1148
1149 if (UNIV_UNLIKELY(m_primary_key >= MAX_KEY)) {
1150 table_name_t table_name;
1151 table_name.m_name = const_cast<char*>(name);
1152 ib::error() << "Table " << table_name
1153 << " has a primary key in InnoDB data"
1154 " dictionary, but not in MySQL!";
1155
1156 /* This mismatch could cause further problems
1157 if not attended, bring this to the user's attention
1158 by printing a warning in addition to log a message
1159 in the errorlog. */
1160
1161 push_warning_printf(thd, Sql_condition::SL_WARNING,
1162 ER_NO_SUCH_INDEX,
1163 "Table %s has a"
1164 " primary key in InnoDB data"
1165 " dictionary, but not in"
1166 " MySQL!", name);
1167
1168 /* If m_primary_key >= MAX_KEY, its (m_primary_key)
1169 value could be out of bound if continue to index
1170 into key_info[] array. Find InnoDB primary index,
1171 and assign its key_length to ref_length.
1172 In addition, since MySQL indexes are sorted starting
1173 with primary index, unique index etc., initialize
1174 ref_length to the first index key length in
1175 case we fail to find InnoDB cluster index.
1176
1177 Please note, this will not resolve the primary
1178 index mismatch problem, other side effects are
1179 possible if users continue to use the table.
1180 However, we allow this table to be opened so
1181 that user can adopt necessary measures for the
1182 mismatch while still being accessible to the table
1183 date. */
1184
1185 if (table->key_info == NULL) {
1186 ut_ad(table->s->keys == 0);
1187 ref_length = 0;
1188 } else {
1189 ref_length = table->key_info[0].key_length;
1190 }
1191
1192 /* Find corresponding cluster index
1193 key length in MySQL's key_info[] array. */
1194
1195 for (uint i = 0; i < table->s->keys; i++) {
1196 dict_index_t* index;
1197 index = innopart_get_index(0, i);
1198 if (dict_index_is_clust(index)) {
1199 ref_length =
1200 table->key_info[i].key_length;
1201 }
1202 }
1203 ut_a(ref_length);
1204 ref_length += PARTITION_BYTES_IN_POS;
1205 } else {
1206 /* MySQL allocates the buffer for ref.
1207 key_info->key_length includes space for all key
1208 columns + one byte for each column that may be
1209 NULL. ref_length must be as exact as possible to
1210 save space, because all row reference buffers are
1211 allocated based on ref_length. */
1212
1213 ref_length = table->key_info[m_primary_key].key_length;
1214 ref_length += PARTITION_BYTES_IN_POS;
1215 }
1216 } else {
1217 if (m_primary_key != MAX_KEY) {
1218 table_name_t table_name;
1219 table_name.m_name = const_cast<char*>(name);
1220 ib::error() << "Table " << table_name
1221 << " has no primary key in InnoDB data"
1222 " dictionary, but has one in MySQL! If you"
1223 " created the table with a MySQL version <"
1224 " 3.23.54 and did not define a primary key,"
1225 " but defined a unique key with all non-NULL"
1226 " columns, then MySQL internally treats that"
1227 " key as the primary key. You can fix this"
1228 " error by dump + DROP + CREATE + reimport"
1229 " of the table.";
1230
1231 /* This mismatch could cause further problems
1232 if not attended, bring this to the user attention
1233 by printing a warning in addition to log a message
1234 in the errorlog. */
1235
1236 push_warning_printf(thd, Sql_condition::SL_WARNING,
1237 ER_NO_SUCH_INDEX,
1238 "InnoDB: Table %s has no"
1239 " primary key in InnoDB data"
1240 " dictionary, but has one in"
1241 " MySQL!", name);
1242 }
1243
1244 m_prebuilt->clust_index_was_generated = TRUE;
1245
1246 ref_length = DATA_ROW_ID_LEN;
1247 ref_length += PARTITION_BYTES_IN_POS;
1248
1249 /* If we automatically created the clustered index, then
1250 MySQL does not know about it, and MySQL must NOT be aware
1251 of the index used on scan, to make it avoid checking if we
1252 update the column of the index. That is why we assert below
1253 that key_used_on_scan is the undefined value MAX_KEY.
1254 The column is the row id in the automatical generation case,
1255 and it will never be updated anyway. */
1256
1257 if (key_used_on_scan != MAX_KEY) {
1258 table_name_t table_name;
1259 table_name.m_name = const_cast<char*>(name);
1260 ib::warn() << "Table " << table_name
1261 << " key_used_on_scan is "
1262 << key_used_on_scan << " even though there is"
1263 " no primary key inside InnoDB.";
1264 }
1265 }
1266
1267 /* Index block size in InnoDB: used by MySQL in query optimization. */
1268 stats.block_size = UNIV_PAGE_SIZE;
1269
1270 if (m_prebuilt->table != NULL) {
1271 /* We update the highest file format in the system table
1272 space, if this table has higher file format setting. */
1273
1274 trx_sys_file_format_max_upgrade(
1275 (const char**) &innobase_file_format_max,
1276 dict_table_get_format(m_prebuilt->table));
1277 }
1278
1279 /* Only if the table has an AUTOINC column. */
1280 if (m_prebuilt->table != NULL
1281 && !m_prebuilt->table->file_unreadable
1282 && table->found_next_number_field != NULL) {
1283 int error;
1284
1285 /* Since a table can already be "open" in InnoDB's internal
1286 data dictionary, we only init the autoinc counter once, the
1287 first time the table is loaded,
1288 see ha_innopart::initialize_auto_increment.
1289 We can safely reuse the autoinc value from a previous MySQL
1290 open. */
1291
1292 lock_auto_increment();
1293 error = initialize_auto_increment(false);
1294 unlock_auto_increment();
1295 if (error != 0) {
1296 close();
1297 DBUG_RETURN(error);
1298 }
1299 }
1300
1301 #ifdef HA_INNOPART_SUPPORTS_FULLTEXT
1302 /* Set plugin parser for fulltext index. */
1303 for (uint i = 0; i < table->s->keys; i++) {
1304 if (table->key_info[i].flags & HA_USES_PARSER) {
1305 dict_index_t* index = innobase_get_index(i);
1306 plugin_ref parser = table->key_info[i].parser;
1307
1308 ut_ad(index->type & DICT_FTS);
1309 index->parser =
1310 static_cast<st_mysql_ftparser *>(
1311 plugin_decl(parser)->info);
1312
1313 DBUG_EXECUTE_IF("fts_instrument_use_default_parser",
1314 index->parser = &fts_default_parser;);
1315 }
1316 }
1317 #endif /* HA_INNOPART_SUPPORTS_FULLTEXT */
1318
1319 size_t alloc_size = sizeof(*m_ins_node_parts) * m_tot_parts;
1320 m_ins_node_parts = static_cast<ins_node_t**>(
1321 ut_zalloc(alloc_size, mem_key_partitioning));
1322
1323 alloc_size = sizeof(*m_upd_node_parts) * m_tot_parts;
1324 m_upd_node_parts = static_cast<upd_node_t**>(
1325 ut_zalloc(alloc_size, mem_key_partitioning));
1326
1327 alloc_blob_heap_array();
1328
1329 alloc_size = sizeof(*m_trx_id_parts) * m_tot_parts;
1330 m_trx_id_parts = static_cast<trx_id_t*>(
1331 ut_zalloc(alloc_size, mem_key_partitioning));
1332
1333 alloc_size = sizeof(*m_row_read_type_parts) * m_tot_parts;
1334 m_row_read_type_parts = static_cast<ulint*>(
1335 ut_zalloc(alloc_size, mem_key_partitioning));
1336
1337 alloc_size = UT_BITS_IN_BYTES(m_tot_parts);
1338 m_sql_stat_start_parts = static_cast<uchar*>(
1339 ut_zalloc(alloc_size, mem_key_partitioning));
1340 if (m_ins_node_parts == NULL
1341 || m_upd_node_parts == NULL
1342 || m_blob_heap_parts == NULL
1343 || m_trx_id_parts == NULL
1344 || m_row_read_type_parts == NULL
1345 || m_sql_stat_start_parts == NULL) {
1346 close(); // Frees all the above.
1347 DBUG_RETURN(HA_ERR_OUT_OF_MEM);
1348 }
1349 info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
1350
1351 DBUG_RETURN(0);
1352 }
1353
1354 /** Get a cloned ha_innopart handler.
1355 @param[in] name Table name.
1356 @param[in] mem_root MySQL mem_root to use.
1357 @return new ha_innopart handler. */
1358 handler*
clone(const char * name,MEM_ROOT * mem_root)1359 ha_innopart::clone(
1360 const char* name,
1361 MEM_ROOT* mem_root)
1362 {
1363 ha_innopart* new_handler;
1364
1365 DBUG_ENTER("ha_innopart::clone");
1366
1367 new_handler = dynamic_cast<ha_innopart*>(handler::clone(name,
1368 mem_root));
1369 if (new_handler != NULL) {
1370 ut_ad(new_handler->m_prebuilt != NULL);
1371
1372 new_handler->m_prebuilt->select_lock_type =
1373 m_prebuilt->select_lock_type;
1374 }
1375
1376 DBUG_RETURN(new_handler);
1377 }
1378
1379 /** Clear used ins_nodes and upd_nodes. */
clear_ins_upd_nodes()1380 void ha_innopart::clear_ins_upd_nodes()
1381 {
1382 /* Free memory from insert nodes. */
1383 if (m_ins_node_parts != NULL) {
1384 for (uint i = 0; i < m_tot_parts; i++) {
1385 if (m_ins_node_parts[i] != NULL) {
1386 ins_node_t* ins = m_ins_node_parts[i];
1387 if (ins->select != NULL) {
1388 que_graph_free_recursive(ins->select);
1389 ins->select = NULL;
1390 }
1391
1392 if (ins->entry_sys_heap != NULL) {
1393 mem_heap_free(ins->entry_sys_heap);
1394 ins->entry_sys_heap = NULL;
1395 }
1396 m_ins_node_parts[i] = NULL;
1397 }
1398 }
1399 }
1400
1401 /* Free memory from update nodes. */
1402 if (m_upd_node_parts != NULL) {
1403 for (uint i = 0; i < m_tot_parts; i++) {
1404 if (m_upd_node_parts[i] != NULL) {
1405 upd_node_t* upd = m_upd_node_parts[i];
1406 if (upd->cascade_heap) {
1407 mem_heap_free(upd->cascade_heap);
1408 upd->cascade_heap = NULL;
1409 }
1410 if (upd->in_mysql_interface) {
1411 btr_pcur_free_for_mysql(upd->pcur);
1412 upd->in_mysql_interface = FALSE;
1413 }
1414
1415 if (upd->select != NULL) {
1416 que_graph_free_recursive(upd->select);
1417 upd->select = NULL;
1418 }
1419 if (upd->heap != NULL) {
1420 mem_heap_free(upd->heap);
1421 upd->heap = NULL;
1422 }
1423 m_upd_node_parts[i] = NULL;
1424 }
1425 }
1426 }
1427 }
1428
/** Closes a handle to an InnoDB table.
Releases the partition share, per-partition prebuilt state and all
per-partition arrays that were allocated in ::open().
@return	0 */
int
ha_innopart::close()
{
	THD*	thd;

	DBUG_ENTER("ha_innopart::close");

	thd = ha_thd();
	if (thd != NULL) {
		innobase_release_temporary_latches(ht, thd);
	}

	/* The ordered-scan pcur arrays must already have been torn down
	by index_end()/destroy_record_priority_queue_for_parts(). */
	ut_ad(m_pcur_parts == NULL);
	ut_ad(m_clust_pcur_parts == NULL);
	close_partitioning();

	ut_ad(m_part_share != NULL);
	if (m_part_share != NULL) {
		/* The share is owned by the TABLE_SHARE; closing the
		table parts must be serialized with other handlers. */
		lock_shared_ha_data();
		m_part_share->close_table_parts();
		unlock_shared_ha_data();
		m_part_share = NULL;
	}
	clear_ins_upd_nodes();
	free_blob_heap_array();

	/* Prevent double close of m_prebuilt->table. The real one was
	done in m_part_share->close_table_parts(). */
	if (m_prebuilt != NULL) {
		m_prebuilt->table = NULL;
		row_prebuilt_free(m_prebuilt, FALSE);
	}

	if (m_upd_buf != NULL) {
		ut_ad(m_upd_buf_size != 0);
		/* Allocated with my_malloc! */
		my_free(m_upd_buf);
		m_upd_buf = NULL;
		m_upd_buf_size = 0;
	}

	/* Free the per-partition arrays allocated in ::open(). */
	if (m_ins_node_parts != NULL) {
		ut_free(m_ins_node_parts);
		m_ins_node_parts = NULL;
	}
	if (m_upd_node_parts != NULL) {
		ut_free(m_upd_node_parts);
		m_upd_node_parts = NULL;
	}
	if (m_trx_id_parts != NULL) {
		ut_free(m_trx_id_parts);
		m_trx_id_parts = NULL;
	}
	if (m_row_read_type_parts != NULL) {
		ut_free(m_row_read_type_parts);
		m_row_read_type_parts = NULL;
	}
	if (m_sql_stat_start_parts != NULL) {
		ut_free(m_sql_stat_start_parts);
		m_sql_stat_start_parts = NULL;
	}

	MONITOR_INC(MONITOR_TABLE_CLOSE);

	/* Tell InnoDB server that there might be work for
	utility threads: */

	srv_active_wake_master_thread();

	DBUG_RETURN(0);
}
1502
/** Change active partition.
Copies needed info into m_prebuilt from the partition specific memory.
m_prebuilt is shared between partitions, so before operating on a
partition its saved state must be installed here (the inverse of
update_partition()).
@param[in]	part_id	Partition to set as active. */
void
ha_innopart::set_partition(
	uint	part_id)
{
	DBUG_ENTER("ha_innopart::set_partition");

	DBUG_PRINT("ha_innopart", ("partition id: %u", part_id));

	if (part_id >= m_tot_parts) {
		/* Out-of-range partition id; debug builds assert,
		release builds silently do nothing. */
		ut_ad(0);
		DBUG_VOID_RETURN;
	}
	/* The pcur arrays only exist during an ordered scan; they are
	indexed through m_pcur_map since only used partitions get a pcur. */
	if (m_pcur_parts != NULL) {
		m_prebuilt->pcur = &m_pcur_parts[m_pcur_map[part_id]];
	}
	if (m_clust_pcur_parts != NULL) {
		m_prebuilt->clust_pcur =
			&m_clust_pcur_parts[m_pcur_map[part_id]];
	}
	m_prebuilt->ins_node = m_ins_node_parts[part_id];
	m_prebuilt->upd_node = m_upd_node_parts[part_id];

	/* For unordered scan and table scan, use blob_heap from first
	partition as we need exactly one blob. */
	m_prebuilt->blob_heap = m_blob_heap_parts[m_ordered ? part_id : 0];

#ifdef UNIV_DEBUG
	if (m_prebuilt->blob_heap != NULL) {
		DBUG_PRINT("ha_innopart", ("validating blob_heap: %p",
					   m_prebuilt->blob_heap));
		mem_heap_validate(m_prebuilt->blob_heap);
	}
#endif

	m_prebuilt->trx_id = m_trx_id_parts[part_id];
	m_prebuilt->row_read_type = m_row_read_type_parts[part_id];
	m_prebuilt->sql_stat_start = get_bit(m_sql_stat_start_parts, part_id);
	m_prebuilt->table = m_part_share->get_table_part(part_id);
	m_prebuilt->index = innopart_get_index(part_id, active_index);

	DBUG_VOID_RETURN;
}
1548
/** Update active partition.
Copies needed info from m_prebuilt into the partition specific memory.
The inverse of set_partition(): saves the shared m_prebuilt state back
into the per-partition arrays after operating on a partition.
@param[in]	part_id	Partition to set as active. */
void
ha_innopart::update_partition(
	uint	part_id)
{
	DBUG_ENTER("ha_innopart::update_partition");
	DBUG_PRINT("ha_innopart", ("partition id: %u", part_id));

	if (part_id >= m_tot_parts) {
		/* Out-of-range partition id; debug builds assert,
		release builds silently do nothing. */
		ut_ad(0);
		DBUG_VOID_RETURN;
	}
	m_ins_node_parts[part_id] = m_prebuilt->ins_node;
	m_upd_node_parts[part_id] = m_prebuilt->upd_node;

#ifdef UNIV_DEBUG
	if (m_prebuilt->blob_heap != NULL) {
		DBUG_PRINT("ha_innopart", ("validating blob_heap: %p",
					   m_prebuilt->blob_heap));
		mem_heap_validate(m_prebuilt->blob_heap);
	}
#endif

	/* For unordered scan and table scan, use blob_heap from first
	partition as we need exactly one blob anytime. */
	m_blob_heap_parts[m_ordered ? part_id : 0] = m_prebuilt->blob_heap;

	m_trx_id_parts[part_id] = m_prebuilt->trx_id;
	m_row_read_type_parts[part_id] = m_prebuilt->row_read_type;
	/* The sql_stat_start bits are only ever cleared here; once a
	statement has started on a partition it stays cleared until the
	bitmap is reinitialized for the next statement. */
	if (m_prebuilt->sql_stat_start == 0) {
		clear_bit(m_sql_stat_start_parts, part_id);
	}
	m_last_part = part_id;
	DBUG_VOID_RETURN;
}
1586
/** Save currently highest auto increment value.
@param[in]	nr	Auto increment value to save. */
void
ha_innopart::save_auto_increment(
	ulonglong	nr)
{

	/* Store it in the shared dictionary of the partition.
	TODO: When the new DD is done, store it in the table and make it
	persistent! */

	/* nr is the value just used, so nr + 1 is the next value to hand
	out; the update is a no-op if the counter is already higher. */
	dict_table_autoinc_lock(m_prebuilt->table);
	dict_table_autoinc_update_if_greater(m_prebuilt->table, nr + 1);
	dict_table_autoinc_unlock(m_prebuilt->table);
}
1602
1603 /** Was the last returned row semi consistent read.
1604 In an UPDATE or DELETE, if the row under the cursor was locked by
1605 another transaction, and the engine used an optimistic read of the last
1606 committed row value under the cursor, then the engine returns 1 from
1607 this function. MySQL must NOT try to update this optimistic value. If
1608 the optimistic value does not match the WHERE condition, MySQL can
1609 decide to skip over this row. This can be used to avoid unnecessary
1610 lock waits.
1611
1612 If this method returns true, it will also signal the storage
1613 engine that the next read will be a locking re-read of the row.
1614 @see handler.h and row0mysql.h
1615 @return true if last read was semi consistent else false. */
1616 bool
was_semi_consistent_read()1617 ha_innopart::was_semi_consistent_read()
1618 {
1619 return(m_row_read_type_parts[m_last_part]
1620 == ROW_READ_DID_SEMI_CONSISTENT);
1621 }
1622
1623 /** Try semi consistent read.
1624 Tell the engine whether it should avoid unnecessary lock waits.
1625 If yes, in an UPDATE or DELETE, if the row under the cursor was locked
1626 by another transaction, the engine may try an optimistic read of
1627 the last committed row value under the cursor.
1628 @see handler.h and row0mysql.h
1629 @param[in] yes Should semi-consistent read be used. */
1630 void
try_semi_consistent_read(bool yes)1631 ha_innopart::try_semi_consistent_read(
1632 bool yes)
1633 {
1634 ha_innobase::try_semi_consistent_read(yes);
1635 for (uint i = m_part_info->get_first_used_partition();
1636 i < m_tot_parts;
1637 i = m_part_info->get_next_used_partition(i)) {
1638
1639 m_row_read_type_parts[i] = m_prebuilt->row_read_type;
1640 }
1641 }
1642
1643 /** Removes a lock on a row.
1644 Removes a new lock set on a row, if it was not read optimistically.
1645 This can be called after a row has been read in the processing of
1646 an UPDATE or a DELETE query. @see ha_innobase::unlock_row(). */
1647 void
unlock_row()1648 ha_innopart::unlock_row()
1649 {
1650 ut_ad(m_last_part < m_tot_parts);
1651 set_partition(m_last_part);
1652 ha_innobase::unlock_row();
1653 update_partition(m_last_part);
1654 }
1655
1656 /** Write a row in partition.
1657 Stores a row in an InnoDB database, to the table specified in this
1658 handle.
1659 @param[in] part_id Partition to write to.
1660 @param[in] record A row in MySQL format.
1661 @return 0 or error code. */
1662 int
write_row_in_part(uint part_id,uchar * record)1663 ha_innopart::write_row_in_part(
1664 uint part_id,
1665 uchar* record)
1666 {
1667 int error;
1668 Field* saved_next_number_field = table->next_number_field;
1669 DBUG_ENTER("ha_innopart::write_row_in_part");
1670 set_partition(part_id);
1671
1672 /* Prevent update_auto_increment to be called
1673 again in ha_innobase::write_row(). */
1674
1675 table->next_number_field = NULL;
1676
1677 /* TODO: try to avoid creating a new dtuple
1678 (in row_get_prebuilt_insert_row()) for each partition).
1679 Might be needed due to ins_node implementation. */
1680
1681 error = ha_innobase::write_row(record);
1682 update_partition(part_id);
1683 table->next_number_field = saved_next_number_field;
1684 DBUG_RETURN(error);
1685 }
1686
1687 /** Update a row in partition.
1688 Updates a row given as a parameter to a new value.
1689 @param[in] part_id Partition to update row in.
1690 @param[in] old_row Old row in MySQL format.
1691 @param[in] new_row New row in MySQL format.
1692 @return 0 or error number. */
1693 int
update_row_in_part(uint part_id,const uchar * old_row,uchar * new_row)1694 ha_innopart::update_row_in_part(
1695 uint part_id,
1696 const uchar* old_row,
1697 uchar* new_row)
1698 {
1699 int error;
1700 DBUG_ENTER("ha_innopart::update_row_in_part");
1701
1702 set_partition(part_id);
1703 error = ha_innobase::update_row(old_row, new_row);
1704 update_partition(part_id);
1705 DBUG_RETURN(error);
1706 }
1707
1708 /** Deletes a row in partition.
1709 @param[in] part_id Partition to delete from.
1710 @param[in] record Row to delete in MySQL format.
1711 @return 0 or error number. */
1712 int
delete_row_in_part(uint part_id,const uchar * record)1713 ha_innopart::delete_row_in_part(
1714 uint part_id,
1715 const uchar* record)
1716 {
1717 int error;
1718 DBUG_ENTER("ha_innopart::delete_row_in_part");
1719 m_err_rec = NULL;
1720
1721 m_last_part = part_id;
1722 set_partition(part_id);
1723 error = ha_innobase::delete_row(record);
1724 update_partition(part_id);
1725 DBUG_RETURN(error);
1726 }
1727
/** Initializes a handle to use an index.
@param[in]	keynr	Key (index) number.
@param[in]	sorted	True if result MUST be sorted according to index.
@return	0 or error number. */
int
ha_innopart::index_init(
	uint	keynr,
	bool	sorted)
{
	int	error;
	uint	part_id = m_part_info->get_first_used_partition();
	DBUG_ENTER("ha_innopart::index_init");

	active_index = keynr;
	if (part_id == MY_BIT_NONE) {
		/* No partition is used by this query: nothing to set up,
		and index_end() will recognize this state. */
		DBUG_RETURN(0);
	}

	error = ph_index_init_setup(keynr, sorted);
	if (error != 0) {
		DBUG_RETURN(error);
	}

	if (sorted) {
		error = init_record_priority_queue();
		if (error != 0) {
			/* Needs cleanup in case it returns error. */
			destroy_record_priority_queue();
			DBUG_RETURN(error);
		}
		/* Disable prefetch.
		The prefetch buffer is not partitioning aware, so it may return
		rows from a different partition if either the prefetch buffer is
		full, or it is non-empty and the partition is exhausted. */
		m_prebuilt->m_no_prefetch = true;
	}

	/* For scan across partitions, the keys needs to be materialized */
	m_prebuilt->m_read_virtual_key = true;

	error = change_active_index(part_id, keynr);
	if (error != 0) {
		/* Safe also in the non-sorted case: the destroy routine
		is a no-op when no priority queue was allocated. */
		destroy_record_priority_queue();
		DBUG_RETURN(error);
	}

	DBUG_EXECUTE_IF("partition_fail_index_init", {
		destroy_record_priority_queue();
		DBUG_RETURN(HA_ERR_NO_PARTITION_FOUND);
	});

	DBUG_RETURN(0);
}
1781
1782 /** End index cursor.
1783 @return 0 or error code. */
1784 int
index_end()1785 ha_innopart::index_end()
1786 {
1787 uint part_id = m_part_info->get_first_used_partition();
1788 DBUG_ENTER("ha_innopart::index_end");
1789
1790 if (part_id == MY_BIT_NONE) {
1791 /* Never initialized any index. */
1792 active_index = MAX_KEY;
1793 DBUG_RETURN(0);
1794 }
1795 if (m_ordered) {
1796 destroy_record_priority_queue();
1797 m_prebuilt->m_no_prefetch = false;
1798 }
1799 m_prebuilt->m_read_virtual_key = false;
1800
1801 DBUG_RETURN(ha_innobase::index_end());
1802 }
1803
1804 /* Partitioning support functions. */
1805
/** Setup the ordered record buffer and the priority queue.
Allocates one btr_pcur_t (and, when needed, one clustered-index
btr_pcur_t) per used partition, plus a part_id -> pcur index map.
On failure the caller is expected to clean up via
destroy_record_priority_queue().
@param[in]	used_parts	Number of used partitions in query.
@return	false for success else true. */
int
ha_innopart::init_record_priority_queue_for_parts(
	uint	used_parts)
{
	size_t	alloc_size;
	void*	buf;

	DBUG_ENTER("ha_innopart::init_record_priority_queue_for_parts");
	ut_ad(used_parts >= 1);
	/* TODO: Don't use this if only one partition is used! */
	//ut_ad(used_parts > 1);

	/* We could reuse current m_prebuilt->pcur/clust_pcur for the first
	used partition, but it would complicate and affect performance,
	so we trade some extra memory instead. */

	/* Save the originals so they can be restored on destroy. */
	m_pcur = m_prebuilt->pcur;
	m_clust_pcur = m_prebuilt->clust_pcur;

	/* If we searching for secondary key or doing a write/update
	we will need two pcur, one for the active (secondary) index and
	one for the clustered index. */

	bool	need_clust_index =
			m_curr_key_info[1] != NULL
			|| get_lock_type() != F_RDLCK;

	/* pcur and clust_pcur per partition.
	By using zalloc, we do not need to initialize the pcur's! */

	alloc_size = used_parts * sizeof(btr_pcur_t);
	if (need_clust_index) {
		alloc_size *= 2;
	}
	buf = ut_zalloc(alloc_size, mem_key_partitioning);
	if (buf == NULL) {
		DBUG_RETURN(true);
	}
	m_pcur_parts = static_cast<btr_pcur_t*>(buf);
	if (need_clust_index) {
		/* Both arrays live in the single allocation; the
		clustered-index cursors start after the used_parts
		secondary-index cursors. */
		m_clust_pcur_parts = &m_pcur_parts[used_parts];
	}
	/* mapping from part_id to pcur. */
	alloc_size = m_tot_parts * sizeof(*m_pcur_map);
	buf = ut_zalloc(alloc_size, mem_key_partitioning);
	if (buf == NULL) {
		/* m_pcur_parts stays set; it is freed by the caller's
		destroy_record_priority_queue(). */
		DBUG_RETURN(true);
	}
	m_pcur_map = static_cast<uint16_t*>(buf);
	{
		/* Only used partitions get a pcur; number them densely
		in partition order. */
		uint16_t pcur_count = 0;
		for (uint i = m_part_info->get_first_used_partition();
		     i < m_tot_parts;
		     i = m_part_info->get_next_used_partition(i)) {
			m_pcur_map[i] = pcur_count++;
		}
	}

	DBUG_RETURN(false);
}
1869
/** Destroy the ordered record buffer and the priority queue.
Frees the per-partition cursors allocated in
init_record_priority_queue_for_parts() and restores the original
prebuilt cursors. Safe to call when nothing was allocated. */
inline
void
ha_innopart::destroy_record_priority_queue_for_parts()
{
	DBUG_ENTER("ha_innopart::destroy_record_priority_queue");
	if (m_pcur_parts != NULL) {
		uint	used_parts;
		used_parts = bitmap_bits_set(&m_part_info->read_partitions);
		for (uint i = 0; i < used_parts; i++) {
			btr_pcur_free(&m_pcur_parts[i]);
			if (m_clust_pcur_parts != NULL) {
				btr_pcur_free(&m_clust_pcur_parts[i]);
			}
		}
		/* Both arrays share one allocation; freeing m_pcur_parts
		releases m_clust_pcur_parts as well. */
		ut_free(m_pcur_parts);
		m_clust_pcur_parts = NULL;
		m_pcur_parts = NULL;
		/* Reset the original m_prebuilt->pcur. */
		m_prebuilt->pcur = m_pcur;
		m_prebuilt->clust_pcur = m_clust_pcur;
	}
	if (m_pcur_map != NULL) {
		ut_free(m_pcur_map);
		m_pcur_map = NULL;
	}
	DBUG_VOID_RETURN;
}
1898
1899 /** Print error information.
1900 @param[in] error Error code (MySQL).
1901 @param[in] errflag Flags. */
1902 void
print_error(int error,myf errflag)1903 ha_innopart::print_error(
1904 int error,
1905 myf errflag)
1906 {
1907 DBUG_ENTER("ha_innopart::print_error");
1908 if (print_partition_error(error, errflag)) {
1909 ha_innobase::print_error(error, errflag);
1910 }
1911
1912 DBUG_VOID_RETURN;
1913 }
1914
1915 /** Can error be ignored.
1916 @param[in] error Error code to check.
1917 @return true if ignorable else false. */
1918 bool
is_ignorable_error(int error)1919 ha_innopart::is_ignorable_error(
1920 int error)
1921 {
1922 if (ha_innobase::is_ignorable_error(error)
1923 || error == HA_ERR_NO_PARTITION_FOUND
1924 || error == HA_ERR_NOT_IN_LOCK_PARTITIONS) {
1925
1926 return(true);
1927 }
1928 return(false);
1929 }
1930
1931 /** Get the index for the current partition
1932 @param[in] keynr MySQL index number.
1933 @return InnoDB index or NULL. */
1934 inline
1935 dict_index_t*
innobase_get_index(uint keynr)1936 ha_innopart::innobase_get_index(
1937 uint keynr)
1938 {
1939 uint part_id = m_last_part;
1940 if (part_id >= m_tot_parts) {
1941 ut_ad(0);
1942 part_id = 0;
1943 }
1944 return(innopart_get_index(part_id, keynr));
1945 }
1946
/** Get the index for a handle.
Does not change active index.
@param[in]	keynr	Use this index; MAX_KEY means always clustered index,
even if it was internally generated by InnoDB.
@param[in]	part_id	From this partition.
@return	NULL or index instance. */
inline
dict_index_t*
ha_innopart::innopart_get_index(
	uint	part_id,
	uint	keynr)
{
	KEY*		key = NULL;
	dict_index_t*	index = NULL;

	DBUG_ENTER("innopart_get_index");

	if (keynr != MAX_KEY && table->s->keys > 0) {
		key = table->key_info + keynr;

		/* Fast path: the share keeps a keynr -> dict_index_t
		translation table per partition. */
		index = m_part_share->get_index(part_id, keynr);

		if (index != NULL) {
			ut_a(ut_strcmp(index->name, key->name) == 0);
		} else {
			/* Can't find index with keynr in the translation
			table. Only print message if the index translation
			table exists. */

			ib::warn() << "InnoDB could not find index "
				<< (key ? key->name : "NULL")
				<< " key no " << keynr << " for table "
				<< m_prebuilt->table->name
				<< " through its index translation table";

			/* Fall back to a name lookup in the dictionary
			cache. */
			index = dict_table_get_index_on_name(m_prebuilt->table,
							     key->name);
		}
	} else {
		/* Get the generated index. */
		ut_ad(keynr == MAX_KEY);
		/* The clustered index is always the first index of the
		partition's dictionary table object. */
		index = dict_table_get_first_index(
			m_part_share->get_table_part(part_id));
	}

	if (index == NULL) {
		ib::error() << "InnoDB could not find key n:o "
			<< keynr << " with name " << (key ? key->name : "NULL")
			<< " from dict cache for table "
			<< m_prebuilt->table->name << " partition n:o "
			<< part_id;
	}

	DBUG_RETURN(index);
}
2002
/** Changes the active index of a handle.
Sets the given partition active and validates that the requested index
is usable by the current transaction (not corrupted, sufficient MVCC
history), then prepares m_prebuilt->search_tuple and the row template.
@param[in]	part_id	Use this partition.
@param[in]	keynr	Use this index; MAX_KEY means always clustered index,
even if it was internally generated by InnoDB.
@return	0 or error number. */
int
ha_innopart::change_active_index(
	uint	part_id,
	uint	keynr)
{
	DBUG_ENTER("ha_innopart::change_active_index");

	ut_ad(m_user_thd == ha_thd());
	ut_a(m_prebuilt->trx == thd_to_trx(m_user_thd));

	active_index = keynr;
	/* set_partition() also sets m_prebuilt->index via
	innopart_get_index(part_id, active_index). */
	set_partition(part_id);

	if (UNIV_UNLIKELY(m_prebuilt->index == NULL)) {
		ib::warn() << "change_active_index(" << part_id
			<< "," << keynr << ") failed";
		m_prebuilt->index_usable = FALSE;
		DBUG_RETURN(1);
	}

	m_prebuilt->index_usable = row_merge_is_index_usable(m_prebuilt->trx,
							     m_prebuilt->index);

	if (UNIV_UNLIKELY(!m_prebuilt->index_usable)) {
		if (dict_index_is_corrupted(m_prebuilt->index)) {
			char	table_name[MAX_FULL_NAME_LEN + 1];

			innobase_format_name(
				table_name, sizeof table_name,
				m_prebuilt->index->table->name.m_name);

			push_warning_printf(
				m_user_thd, Sql_condition::SL_WARNING,
				HA_ERR_INDEX_CORRUPT,
				"InnoDB: Index %s for table %s is"
				" marked as corrupted"
				" (partition %u)",
				m_prebuilt->index->name(), table_name, part_id);
			DBUG_RETURN(HA_ERR_INDEX_CORRUPT);
		} else {
			/* The index exists but was created after this
			transaction's read view; it cannot serve a
			consistent read. */
			push_warning_printf(
				m_user_thd, Sql_condition::SL_WARNING,
				HA_ERR_TABLE_DEF_CHANGED,
				"InnoDB: insufficient history for index %u",
				keynr);
		}

		/* The caller seems to ignore this. Thus, we must check
		this again in row_search_for_mysql(). */

		DBUG_RETURN(HA_ERR_TABLE_DEF_CHANGED);
	}

	ut_a(m_prebuilt->search_tuple != NULL);

	/* If too expensive, cache the keynr and only update search_tuple when
	keynr changes. Remember that the clustered index is also used for
	MAX_KEY. */
	dtuple_set_n_fields(m_prebuilt->search_tuple,
			    m_prebuilt->index->n_fields);

	dict_index_copy_types(m_prebuilt->search_tuple, m_prebuilt->index,
			      m_prebuilt->index->n_fields);

	/* MySQL changes the active index for a handle also during some
	queries, for example SELECT MAX(a), SUM(a) first retrieves the
	MAX() and then calculates the sum. Previously we played safe
	and used the flag ROW_MYSQL_WHOLE_ROW below, but that caused
	unnecessary copying. Starting from MySQL-4.1 we use a more
	efficient flag here. */

	/* TODO: Is this really needed?
	Will it not be built in index_read? */

	build_template(false);

	DBUG_RETURN(0);
}
2086
2087 /** Return first record in index from a partition.
2088 @param[in] part Partition to read from.
2089 @param[out] record First record in index in the partition.
2090 @return error number or 0. */
2091 int
index_first_in_part(uint part,uchar * record)2092 ha_innopart::index_first_in_part(
2093 uint part,
2094 uchar* record)
2095 {
2096 int error;
2097 DBUG_ENTER("ha_innopart::index_first_in_part");
2098
2099 set_partition(part);
2100 error = ha_innobase::index_first(record);
2101 update_partition(part);
2102
2103 DBUG_RETURN(error);
2104 }
2105
2106 /** Return next record in index from a partition.
2107 @param[in] part Partition to read from.
2108 @param[out] record Last record in index in the partition.
2109 @return error number or 0. */
2110 int
index_next_in_part(uint part,uchar * record)2111 ha_innopart::index_next_in_part(
2112 uint part,
2113 uchar* record)
2114 {
2115 DBUG_ENTER("ha_innopart::index_next_in_part");
2116
2117 int error;
2118
2119 set_partition(part);
2120 error = ha_innobase::index_next(record);
2121 update_partition(part);
2122
2123 ut_ad(m_ordered_scan_ongoing
2124 || m_ordered_rec_buffer == NULL
2125 || m_prebuilt->used_in_HANDLER
2126 || m_part_spec.start_part >= m_part_spec.end_part);
2127
2128 DBUG_RETURN(error);
2129 }
2130
2131 /** Return next same record in index from a partition.
2132 This routine is used to read the next record, but only if the key is
2133 the same as supplied in the call.
2134 @param[in] part Partition to read from.
2135 @param[out] record Last record in index in the partition.
2136 @param[in] key Key to match.
2137 @param[in] length Length of key.
2138 @return error number or 0. */
2139 int
index_next_same_in_part(uint part,uchar * record,const uchar * key,uint length)2140 ha_innopart::index_next_same_in_part(
2141 uint part,
2142 uchar* record,
2143 const uchar* key,
2144 uint length)
2145 {
2146 int error;
2147
2148 set_partition(part);
2149 error = ha_innobase::index_next_same(record, key, length);
2150 update_partition(part);
2151 return(error);
2152 }
2153
2154 /** Return last record in index from a partition.
2155 @param[in] part Partition to read from.
2156 @param[out] record Last record in index in the partition.
2157 @return error number or 0. */
2158 int
index_last_in_part(uint part,uchar * record)2159 ha_innopart::index_last_in_part(
2160 uint part,
2161 uchar* record)
2162 {
2163 int error;
2164
2165 set_partition(part);
2166 error = ha_innobase::index_last(record);
2167 update_partition(part);
2168 return(error);
2169 }
2170
2171 /** Return previous record in index from a partition.
2172 @param[in] part Partition to read from.
2173 @param[out] record Last record in index in the partition.
2174 @return error number or 0. */
2175 int
index_prev_in_part(uint part,uchar * record)2176 ha_innopart::index_prev_in_part(
2177 uint part,
2178 uchar* record)
2179 {
2180 int error;
2181
2182 set_partition(part);
2183 error = ha_innobase::index_prev(record);
2184 update_partition(part);
2185
2186 ut_ad(m_ordered_scan_ongoing
2187 || m_ordered_rec_buffer == NULL
2188 || m_prebuilt->used_in_HANDLER
2189 || m_part_spec.start_part >= m_part_spec.end_part);
2190
2191 return(error);
2192 }
2193
2194 /** Start index scan and return first record from a partition.
2195 This routine starts an index scan using a start key. The calling
2196 function will check the end key on its own.
2197 @param[in] part Partition to read from.
2198 @param[out] record First matching record in index in the partition.
2199 @param[in] key Key to match.
2200 @param[in] keypart_map Which part of the key to use.
2201 @param[in] find_flag Key condition/direction to use.
2202 @return error number or 0. */
2203 int
index_read_map_in_part(uint part,uchar * record,const uchar * key,key_part_map keypart_map,enum ha_rkey_function find_flag)2204 ha_innopart::index_read_map_in_part(
2205 uint part,
2206 uchar* record,
2207 const uchar* key,
2208 key_part_map keypart_map,
2209 enum ha_rkey_function find_flag)
2210 {
2211 int error;
2212
2213 set_partition(part);
2214 error = ha_innobase::index_read_map(
2215 record,
2216 key,
2217 keypart_map,
2218 find_flag);
2219 update_partition(part);
2220 return(error);
2221 }
2222
2223 /** Start index scan and return first record from a partition.
2224 This routine starts an index scan using a start key. The calling
2225 function will check the end key on its own.
2226 @param[in] part Partition to read from.
2227 @param[out] record First matching record in index in the partition.
2228 @param[in] index Index to read from.
2229 @param[in] key Key to match.
2230 @param[in] keypart_map Which part of the key to use.
2231 @param[in] find_flag Key condition/direction to use.
2232 @return error number or 0. */
2233 int
index_read_idx_map_in_part(uint part,uchar * record,uint index,const uchar * key,key_part_map keypart_map,enum ha_rkey_function find_flag)2234 ha_innopart::index_read_idx_map_in_part(
2235 uint part,
2236 uchar* record,
2237 uint index,
2238 const uchar* key,
2239 key_part_map keypart_map,
2240 enum ha_rkey_function find_flag)
2241 {
2242 int error;
2243
2244 set_partition(part);
2245 error = ha_innobase::index_read_idx_map(
2246 record,
2247 index,
2248 key,
2249 keypart_map,
2250 find_flag);
2251 update_partition(part);
2252 return(error);
2253 }
2254
2255 /** Return last matching record in index from a partition.
2256 @param[in] part Partition to read from.
2257 @param[out] record Last matching record in index in the partition.
2258 @param[in] key Key to match.
2259 @param[in] keypart_map Which part of the key to use.
2260 @return error number or 0. */
2261 int
index_read_last_map_in_part(uint part,uchar * record,const uchar * key,key_part_map keypart_map)2262 ha_innopart::index_read_last_map_in_part(
2263 uint part,
2264 uchar* record,
2265 const uchar* key,
2266 key_part_map keypart_map)
2267 {
2268 int error;
2269 set_partition(part);
2270 error = ha_innobase::index_read_last_map(record, key, keypart_map);
2271 update_partition(part);
2272 return(error);
2273 }
2274
2275 /** Start index scan and return first record from a partition.
2276 This routine starts an index scan using a start and end key.
2277 @param[in] part Partition to read from.
2278 @param[in,out] record First matching record in index in the partition,
2279 if NULL use table->record[0] as return buffer.
2280 @param[in] start_key Start key to match.
2281 @param[in] end_key End key to match.
2282 @param[in] eq_range Is equal range, start_key == end_key.
2283 @param[in] sorted Return rows in sorted order.
2284 @return error number or 0. */
2285 int
read_range_first_in_part(uint part,uchar * record,const key_range * start_key,const key_range * end_key,bool eq_range,bool sorted)2286 ha_innopart::read_range_first_in_part(
2287 uint part,
2288 uchar* record,
2289 const key_range* start_key,
2290 const key_range* end_key,
2291 bool eq_range,
2292 bool sorted)
2293 {
2294 int error;
2295 uchar* read_record = record;
2296 set_partition(part);
2297 if (read_record == NULL) {
2298 read_record = table->record[0];
2299 }
2300 if (m_start_key.key != NULL) {
2301 error = ha_innobase::index_read(
2302 read_record,
2303 m_start_key.key,
2304 m_start_key.length,
2305 m_start_key.flag);
2306 } else {
2307 error = ha_innobase::index_first(read_record);
2308 }
2309 if (error == HA_ERR_KEY_NOT_FOUND) {
2310 error = HA_ERR_END_OF_FILE;
2311 } else if (error == 0 && !in_range_check_pushed_down) {
2312 /* compare_key uses table->record[0], so we
2313 need to copy the data if not already there. */
2314
2315 if (record != NULL) {
2316 copy_cached_row(table->record[0], read_record);
2317 }
2318 if (compare_key(end_range) > 0) {
2319 /* must use ha_innobase:: due to set/update_partition
2320 could overwrite states if ha_innopart::unlock_row()
2321 was used. */
2322 ha_innobase::unlock_row();
2323 error = HA_ERR_END_OF_FILE;
2324 }
2325 }
2326 update_partition(part);
2327 return(error);
2328 }
2329
2330 /** Return next record in index range scan from a partition.
2331 @param[in] part Partition to read from.
2332 @param[in,out] record First matching record in index in the partition,
2333 if NULL use table->record[0] as return buffer.
2334 @return error number or 0. */
2335 int
read_range_next_in_part(uint part,uchar * record)2336 ha_innopart::read_range_next_in_part(
2337 uint part,
2338 uchar* record)
2339 {
2340 int error;
2341 uchar* read_record = record;
2342
2343 set_partition(part);
2344 if (read_record == NULL) {
2345 read_record = table->record[0];
2346 }
2347
2348 /* TODO: Implement ha_innobase::read_range*?
2349 So it will return HA_ERR_END_OF_FILE or
2350 HA_ERR_KEY_NOT_FOUND when passing end_range. */
2351
2352 error = ha_innobase::index_next(read_record);
2353 if (error == 0 && !in_range_check_pushed_down) {
2354 /* compare_key uses table->record[0], so we
2355 need to copy the data if not already there. */
2356
2357 if (record != NULL) {
2358 copy_cached_row(table->record[0], read_record);
2359 }
2360 if (compare_key(end_range) > 0) {
2361 /* must use ha_innobase:: due to set/update_partition
2362 could overwrite states if ha_innopart::unlock_row()
2363 was used. */
2364 ha_innobase::unlock_row();
2365 error = HA_ERR_END_OF_FILE;
2366 }
2367 }
2368 update_partition(part);
2369
2370 return(error);
2371 }
2372
2373 /** Initialize a table scan in a specific partition.
2374 @param[in] part_id Partition to initialize.
2375 @param[in] scan True if table/index scan false otherwise (for rnd_pos)
2376 @return 0 or error number. */
2377 int
rnd_init_in_part(uint part_id,bool scan)2378 ha_innopart::rnd_init_in_part(
2379 uint part_id,
2380 bool scan)
2381 {
2382 int err;
2383
2384 if (m_prebuilt->clust_index_was_generated) {
2385 err = change_active_index(part_id, MAX_KEY);
2386 } else {
2387 err = change_active_index(part_id, m_primary_key);
2388 }
2389
2390 m_start_of_scan = 1;
2391
2392 /* Don't use semi-consistent read in random row reads (by position).
2393 This means we must disable semi_consistent_read if scan is false. */
2394
2395 if (!scan) {
2396 try_semi_consistent_read(false);
2397 }
2398
2399 return(err);
2400 }
2401
2402 /** Ends a table scan.
2403 @param[in] part_id Partition to end table scan in.
2404 @param[in] scan True for scan else random access.
2405 @return 0 or error number. */
2406 int
rnd_end_in_part(uint part_id,bool scan)2407 ha_innopart::rnd_end_in_part(
2408 uint part_id,
2409 bool scan)
2410 {
2411 return(index_end());
2412 }
2413
2414 /** Read next row in partition.
2415 Reads the next row in a table scan (also used to read the FIRST row
2416 in a table scan).
2417 @param[in] part_id Partition to end table scan in.
2418 @param[out] buf Returns the row in this buffer, in MySQL format.
2419 @return 0, HA_ERR_END_OF_FILE or error number. */
2420 int
rnd_next_in_part(uint part_id,uchar * buf)2421 ha_innopart::rnd_next_in_part(
2422 uint part_id,
2423 uchar* buf)
2424 {
2425 int error;
2426
2427 DBUG_ENTER("ha_innopart::rnd_next_in_part");
2428
2429 set_partition(part_id);
2430 if (m_start_of_scan) {
2431 error = ha_innobase::index_first(buf);
2432
2433 if (error == HA_ERR_KEY_NOT_FOUND) {
2434 error = HA_ERR_END_OF_FILE;
2435 }
2436 m_start_of_scan = 0;
2437 } else {
2438 ha_statistic_increment(&SSV::ha_read_rnd_next_count);
2439 error = ha_innobase::general_fetch(buf, ROW_SEL_NEXT, 0);
2440 }
2441
2442 update_partition(part_id);
2443 DBUG_RETURN(error);
2444 }
2445
2446 /** Get a row from a position.
2447 Fetches a row from the table based on a row reference.
2448 @param[out] buf Returns the row in this buffer, in MySQL format.
2449 @param[in] pos Position, given as primary key value or DB_ROW_ID
2450 (if no primary key) of the row in MySQL format. The length of data in pos has
2451 to be ref_length.
2452 @return 0, HA_ERR_KEY_NOT_FOUND or error code. */
2453 int
rnd_pos(uchar * buf,uchar * pos)2454 ha_innopart::rnd_pos(
2455 uchar* buf,
2456 uchar* pos)
2457 {
2458 int error;
2459 uint part_id;
2460 DBUG_ENTER("ha_innopart::rnd_pos");
2461 ut_ad(PARTITION_BYTES_IN_POS == 2);
2462 DBUG_DUMP("pos", pos, ref_length);
2463
2464 ha_statistic_increment(&SSV::ha_read_rnd_count);
2465
2466 ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));
2467
2468 /* Restore used partition. */
2469 part_id = uint2korr(pos);
2470
2471 set_partition(part_id);
2472
2473 /* Note that we assume the length of the row reference is fixed
2474 for the table, and it is == ref_length. */
2475
2476 error = ha_innobase::index_read(buf, pos + PARTITION_BYTES_IN_POS,
2477 ref_length - PARTITION_BYTES_IN_POS,
2478 HA_READ_KEY_EXACT);
2479 DBUG_PRINT("info", ("part %u index_read returned %d", part_id, error));
2480 DBUG_DUMP("buf", buf, table_share->reclength);
2481
2482 update_partition(part_id);
2483
2484 DBUG_RETURN(error);
2485 }
2486
2487 /** Return position for cursor in last used partition.
2488 Stores a reference to the current row to 'ref' field of the handle. Note
2489 that in the case where we have generated the clustered index for the
2490 table, the function parameter is illogical: we MUST ASSUME that 'record'
2491 is the current 'position' of the handle, because if row ref is actually
2492 the row id internally generated in InnoDB, then 'record' does not contain
2493 it. We just guess that the row id must be for the record where the handle
2494 was positioned the last time.
2495 @param[out] ref_arg Pointer to buffer where to write the position.
2496 @param[in] record Record to position for. */
2497 void
position_in_last_part(uchar * ref_arg,const uchar * record)2498 ha_innopart::position_in_last_part(
2499 uchar* ref_arg,
2500 const uchar* record)
2501 {
2502 if (m_prebuilt->clust_index_was_generated) {
2503 /* No primary key was defined for the table and we
2504 generated the clustered index from row id: the
2505 row reference will be the row id, not any key value
2506 that MySQL knows of. */
2507
2508 memcpy(ref_arg, m_prebuilt->row_id, DATA_ROW_ID_LEN);
2509 } else {
2510
2511 /* Copy primary key as the row reference */
2512 KEY* key_info = table->key_info + m_primary_key;
2513 key_copy(ref_arg, (uchar*)record, key_info,
2514 key_info->key_length);
2515 }
2516 }
2517
2518 /** Fill in data_dir_path and tablespace name from internal data
2519 dictionary.
2520 @param part_elem Partition element to fill.
2521 @param ib_table InnoDB table to copy from. */
2522 void
update_part_elem(partition_element * part_elem,dict_table_t * ib_table)2523 ha_innopart::update_part_elem(
2524 partition_element* part_elem,
2525 dict_table_t* ib_table)
2526 {
2527 dict_get_and_save_data_dir_path(ib_table, false);
2528 if (ib_table->data_dir_path != NULL) {
2529 if (part_elem->data_file_name == NULL
2530 || strcmp(ib_table->data_dir_path,
2531 part_elem->data_file_name) != 0) {
2532
2533 /* Play safe and allocate memory from TABLE and copy
2534 instead of expose the internal data dictionary. */
2535 part_elem->data_file_name =
2536 strdup_root(&table->mem_root,
2537 ib_table->data_dir_path);
2538 }
2539 } else {
2540 part_elem->data_file_name = NULL;
2541 }
2542
2543 part_elem->index_file_name = NULL;
2544 dict_get_and_save_space_name(ib_table, false);
2545 if (ib_table->tablespace != NULL) {
2546 ut_ad(part_elem->tablespace_name == NULL
2547 || 0 == strcmp(part_elem->tablespace_name,
2548 ib_table->tablespace));
2549 if (part_elem->tablespace_name == NULL
2550 || strcmp(ib_table->tablespace,
2551 part_elem->tablespace_name) != 0) {
2552
2553 /* Play safe and allocate memory from TABLE and copy
2554 instead of expose the internal data dictionary. */
2555 part_elem->tablespace_name =
2556 strdup_root(&table->mem_root,
2557 ib_table->tablespace);
2558 }
2559 }
2560 else {
2561 ut_ad(part_elem->tablespace_name == NULL
2562 || 0 == strcmp(part_elem->tablespace_name,
2563 "innodb_file_per_table"));
2564 if (part_elem->tablespace_name != NULL
2565 && 0 != strcmp(part_elem->tablespace_name,
2566 "innodb_file_per_table")) {
2567
2568 /* Update part_elem tablespace to NULL same as in
2569 innodb data dictionary ib_table. */
2570 part_elem->tablespace_name = NULL;
2571 }
2572 }
2573 }
2574
/** Update create_info.
Used in SHOW CREATE TABLE et al.
@param[in,out]	create_info	Create info to update. */
void
ha_innopart::update_create_info(
	HA_CREATE_INFO*	create_info)
{
	uint		num_subparts = m_part_info->num_subparts;
	uint		num_parts;
	uint		part;
	dict_table_t*	table;
	List_iterator<partition_element>
			part_it(m_part_info->partitions);
	partition_element*	part_elem;
	partition_element*	sub_elem;
	DBUG_ENTER("ha_innopart::update_create_info");

	/* Fill in the auto_increment value unless the caller already
	supplied one (HA_CREATE_USED_AUTO). */
	if ((create_info->used_fields & HA_CREATE_USED_AUTO) == 0) {
		info(HA_STATUS_AUTO);
		create_info->auto_increment_value = stats.auto_increment_value;
	}

	/* m_tot_parts counts leaf partitions; derive the number of
	top-level partitions when sub-partitioning is used. */
	num_parts = (num_subparts != 0) ? m_tot_parts / num_subparts : m_tot_parts;

	/* DATA/INDEX DIRECTORY are never applied to the whole partitioned
	table, only to its parts. */

	create_info->data_file_name = NULL;
	create_info->index_file_name = NULL;

	/* Since update_create_info() can be called from
	mysql_prepare_alter_table() when not all partitions are set up,
	we look for that condition first.
	If all partitions are not available then simply return,
	since it does not need any updated partitioning info. */

	if (!m_part_info->temp_partitions.is_empty()) {
		DBUG_VOID_RETURN;
	}

	/* First pass: count the (sub-)partitions in m_part_info and
	bail out if they do not match the number of open partitions. */
	part = 0;
	while ((part_elem = part_it++)) {
		if (part >= num_parts) {
			DBUG_VOID_RETURN;
		}
		if (m_part_info->is_sub_partitioned()) {
			List_iterator<partition_element>
				subpart_it(part_elem->subpartitions);
			uint subpart = 0;
			while ((sub_elem = subpart_it++)) {
				if (subpart >= num_subparts) {
					DBUG_VOID_RETURN;
				}
				subpart++;
			}
			if (subpart != num_subparts) {
				DBUG_VOID_RETURN;
			}
		}
		part++;
	}
	if (part != num_parts) {
		DBUG_VOID_RETURN;
	}

	/* part_elem->data_file_name and tablespace_name should be correct from
	the .frm, but may have been changed, so update from SYS_DATAFILES.
	index_file_name is ignored, so remove it. */

	/* Second pass: refresh each partition element from the
	corresponding InnoDB table object. Note that 'part' indexes the
	leaf partitions in the same order as get_table_part(). */
	part = 0;
	part_it.rewind();
	while ((part_elem = part_it++)) {
		if (m_part_info->is_sub_partitioned()) {
			List_iterator<partition_element>
				subpart_it(part_elem->subpartitions);
			while ((sub_elem = subpart_it++)) {
				table = m_part_share->get_table_part(part++);
				update_part_elem(sub_elem, table);
			}
		} else {
			table = m_part_share->get_table_part(part++);
			update_part_elem(part_elem, table);
		}
	}
	DBUG_VOID_RETURN;
}
2659
2660 /** Set create_info->data_file_name.
2661 @param[in] part_elem Partition to copy from.
2662 @param[in,out] info Create info to set. */
2663 static
2664 void
set_create_info_dir(partition_element * part_elem,HA_CREATE_INFO * info)2665 set_create_info_dir(
2666 partition_element* part_elem,
2667 HA_CREATE_INFO* info)
2668 {
2669 if (part_elem->data_file_name != NULL
2670 && part_elem->data_file_name[0] != '\0') {
2671 info->data_file_name = part_elem->data_file_name;
2672 /* Also implies non-default tablespace. */
2673 info->tablespace = NULL;
2674 }
2675 if (part_elem->index_file_name != NULL
2676 && part_elem->index_file_name[0] != '\0') {
2677 info->index_file_name = part_elem->index_file_name;
2678 }
2679 if (part_elem->tablespace_name != NULL
2680 && part_elem->tablespace_name[0] != '\0') {
2681 info->tablespace = part_elem->tablespace_name;
2682 }
2683 }
2684
2685 /** Set flags and append '/' to remote path if necessary. */
2686 void
set_remote_path_flags()2687 create_table_info_t::set_remote_path_flags()
2688 {
2689 if (m_remote_path[0] != '\0') {
2690 ut_ad(DICT_TF_HAS_DATA_DIR(m_flags) != 0);
2691
2692 /* os_file_make_remote_pathname will truncate
2693 everything after the last '/', so append '/'
2694 if it is not the last character. */
2695
2696 size_t len = strlen(m_remote_path);
2697 if (m_remote_path[len - 1] != OS_PATH_SEPARATOR) {
2698 m_remote_path[len] = OS_PATH_SEPARATOR;
2699 m_remote_path[len + 1] = '\0';
2700 }
2701 } else {
2702 ut_ad(DICT_TF_HAS_DATA_DIR(m_flags) == 0);
2703 }
2704 }
2705
/** Creates a new table to an InnoDB database.
Creates one InnoDB table per leaf (sub-)partition while holding the
data dictionary latch, commits, and then updates the dictionary info
(stats etc.) per partition. On failure all partitions created so far
are dropped.
@param[in]	name		Table name (in filesystem charset).
@param[in]	form		MySQL Table containing information of
partitions, columns and indexes etc.
@param[in]	create_info	Additional create information, like
create statement string.
@return 0 or error number. */
int
ha_innopart::create(
	const char*	name,
	TABLE*		form,
	HA_CREATE_INFO*	create_info)
{
	int		error;
	/** {database}/{tablename} */
	char		table_name[FN_REFLEN];
	/** absolute path of temp frm */
	char		temp_path[FN_REFLEN];
	/** absolute path of table */
	char		remote_path[FN_REFLEN];
	char		partition_name[FN_REFLEN];
	char		tablespace_name[NAME_LEN + 1];
	char*		table_name_end;
	size_t		table_name_len;
	size_t		db_name_length;
	ulint		stat_table_name_length;
	char*		partition_name_start;
	char		table_data_file_name[FN_REFLEN];
	char		table_level_tablespace_name[NAME_LEN + 1];
	const char*	index_file_name;
	size_t		len;

	create_table_info_t	info(ha_thd(),
				     form,
				     create_info,
				     table_name,
				     temp_path,
				     remote_path,
				     tablespace_name);

	DBUG_ENTER("ha_innopart::create");

	/* Partitions in shared tablespaces are deprecated. */
	if (is_shared_tablespace(create_info->tablespace)) {
		push_deprecated_warn_no_replacement(
			ha_thd(), PARTITION_IN_SHARED_TABLESPACE_WARNING);
	}

	ut_ad(create_info != NULL);
	ut_ad(m_part_info == form->part_info);
	ut_ad(table_share != NULL);

	/* Not allowed to create temporary partitioned tables. */
	if (create_info != NULL
	    && (create_info->options & HA_LEX_CREATE_TMP_TABLE) != 0) {
		my_error(ER_PARTITION_NO_TEMPORARY, MYF(0));
		ut_ad(0); // Can we support partitioned temporary tables?
		DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
	}

	error = info.initialize();
	if (error != 0) {
		DBUG_RETURN(error);
	}

	/* Setup and check table level options. */
	error = info.prepare_create_table(name);
	if (error != 0) {
		DBUG_RETURN(error);
	}
	ut_ad(temp_path[0] == '\0');

	/* table_name is "{db}/{table}"; partition names are appended
	after the table name part below. */
	db_name_length = strchr(table_name,'/') - table_name;
	strcpy(partition_name, table_name);
	partition_name_start = partition_name + strlen(partition_name);
	table_name_len = strlen(table_name);
	table_name_end = table_name + table_name_len;
	if (create_info->data_file_name != NULL) {
		/* Strip the tablename from the path. */
		strncpy(table_data_file_name, create_info->data_file_name,
			FN_REFLEN-1);
		table_data_file_name[FN_REFLEN - 1] = '\0';
		char* ptr = strrchr(table_data_file_name, OS_PATH_SEPARATOR);
		ut_ad(ptr != NULL);
		if (ptr != NULL) {
			ptr++;
			*ptr = '\0';
			create_info->data_file_name = table_data_file_name;
		}
	} else {
		table_data_file_name[0] = '\0';
	}
	/* Remember the table-level settings so they can be restored
	after each partition may have overridden them. */
	index_file_name = create_info->index_file_name;
	if (create_info->tablespace != NULL) {
		strcpy(table_level_tablespace_name, create_info->tablespace);
	} else {
		table_level_tablespace_name[0] = '\0';
	}

	info.allocate_trx();

	/* Latch the InnoDB data dictionary exclusively so that no deadlocks
	or lock waits can happen in it during a table create operation.
	Drop table etc. do this latching in row0mysql.cc. */

	row_mysql_lock_data_dictionary(info.trx());

	/* Mismatch can occur in the length of the column "table_name" in
	mysql.innodb_table_stats and mysql.innodb_index_stats after the
	fix to increase the column length of table_name column to accomdate
	partition_names, so we first need to determine the length of the
	"table_name" column and accordingly we can decide the length
	of partition name .*/

	dict_table_t *table = dict_table_get_low(TABLE_STATS_NAME);
	if (table != NULL) {
		ulint col_no = dict_table_has_column(table,"table_name",0);
		ut_ad (col_no != table->n_def);
		stat_table_name_length = table->cols[col_no].len;
		if (stat_table_name_length > NAME_LEN) {
			/* The maximum allowed length is 597 bytes
			,but the file name length cannot cross
			FN_LEN */
			stat_table_name_length = FN_LEN;
		} else {
			stat_table_name_length = NAME_LEN;
		}

	} else {
		/* set the old length of 192 bytes in case of failure */
		stat_table_name_length = NAME_LEN;
		ib::warn() << TABLE_STATS_NAME << " doesnt exist.";
	}

	/* TODO: use the new DD tables instead to decrease duplicate info. */
	List_iterator_fast <partition_element>
		part_it(form->part_info->partitions);
	partition_element* part_elem;
	/* First pass: create the InnoDB table for every leaf
	(sub-)partition, still holding the dictionary latch. */
	while ((part_elem = part_it++)) {
		/* Append the partition name to the table name. */
		len = Ha_innopart_share::append_sep_and_name(
			partition_name_start,
			part_elem->partition_name,
			part_sep,
			FN_REFLEN - table_name_len);
		/* Report error if the partition name with path separator
		exceeds maximum path length. */
		if ((table_name_len + len + sizeof "/") >= FN_REFLEN) {
			error = HA_ERR_INTERNAL_ERROR;
			my_error(ER_IDENT_CAUSES_TOO_LONG_PATH, MYF(0), FN_REFLEN,
				partition_name);
			goto cleanup;
		}

		/* Report error if table name with partition name exceeds
		maximum file name length */
		if ((len + table_name_len - db_name_length - 1)
		     > stat_table_name_length) {
			error = HA_ERR_INTERNAL_ERROR;
			my_error(ER_PATH_LENGTH, MYF(0),
				partition_name + db_name_length + 1 );
			goto cleanup;
		}

		/* Override table level DATA/INDEX DIRECTORY. */
		set_create_info_dir(part_elem, create_info);

		if (!form->part_info->is_sub_partitioned()) {
			if (is_shared_tablespace(part_elem->tablespace_name)) {
				push_deprecated_warn_no_replacement(
					ha_thd(), PARTITION_IN_SHARED_TABLESPACE_WARNING);
			}

			error = info.prepare_create_table(partition_name);
			if (error != 0) {
				goto cleanup;
			}
			info.set_remote_path_flags();
			error = info.create_table();
			if (error != 0) {
				goto cleanup;
			}
		} else {
			size_t	part_name_len = strlen(partition_name_start)
				+ table_name_len;
			char*	part_name_end = partition_name + part_name_len;
			List_iterator_fast <partition_element>
				sub_it(part_elem->subpartitions);
			partition_element*	sub_elem;

			while ((sub_elem = sub_it++)) {
				ut_ad(sub_elem->partition_name != NULL);

				if (is_shared_tablespace(sub_elem->tablespace_name)) {
					push_deprecated_warn_no_replacement(
						ha_thd(), PARTITION_IN_SHARED_TABLESPACE_WARNING);
				}

				/* 'table' will be
				<name>#P#<part_name>#SP#<subpart_name>.
				Append the sub-partition name to
				the partition name. */

				len = Ha_innopart_share::append_sep_and_name(
					part_name_end,
					sub_elem->partition_name,
					sub_sep,
					FN_REFLEN - part_name_len);
				/* Report error if the partition name with path separator
				exceeds maximum path length. */
				if ((len + part_name_len + sizeof "/") >= FN_REFLEN) {
					error = HA_ERR_INTERNAL_ERROR;
					my_error(ER_IDENT_CAUSES_TOO_LONG_PATH, MYF(0),
						FN_REFLEN,
						partition_name);
					goto cleanup;
				}

				/* Report error if table name with partition
				name exceeds maximum file name length */
				if ((len + part_name_len - db_name_length -1)
				    > stat_table_name_length ) {
					error = HA_ERR_INTERNAL_ERROR;;
					my_error(ER_PATH_LENGTH, MYF(0),
						partition_name + db_name_length + 1);
					goto cleanup;
				}

				/* Override part level DATA/INDEX DIRECTORY. */
				set_create_info_dir(sub_elem, create_info);

				/* NOTE(review): the +4 appears to skip the
				sub-partition separator before lower-casing
				the sub-partition name — assumes the
				separator is 4 chars; confirm sub_sep. */
				Ha_innopart_share::partition_name_casedn_str(
					part_name_end + 4);
				error = info.prepare_create_table(partition_name);
				if (error != 0) {
					goto cleanup;
				}
				info.set_remote_path_flags();
				error = info.create_table();
				if (error != 0) {
					goto cleanup;
				}

				/* Reset partition level
				DATA/INDEX DIRECTORY. */

				create_info->data_file_name =
					table_data_file_name;
				create_info->index_file_name =
					index_file_name;
				create_info->tablespace =
					table_level_tablespace_name;
				set_create_info_dir(part_elem, create_info);
			}
		}
		/* Reset table level DATA/INDEX DIRECTORY. */
		create_info->data_file_name = table_data_file_name;
		create_info->index_file_name = index_file_name;
		create_info->tablespace = table_level_tablespace_name;
	}

	innobase_commit_low(info.trx());

	row_mysql_unlock_data_dictionary(info.trx());

	/* Flush the log to reduce probability that the .frm files and
	the InnoDB data dictionary get out-of-sync if the user runs
	with innodb_flush_log_at_trx_commit = 0. */

	log_buffer_flush_to_disk();

	/* Second pass: update the dictionary info for each created
	partition (done outside the dictionary latch). */
	part_it.rewind();
	/* No need to use these now, only table_name will be used. */
	create_info->data_file_name = NULL;
	create_info->index_file_name = NULL;
	while ((part_elem = part_it++)) {
		len = Ha_innopart_share::append_sep_and_name(
			table_name_end,
			part_elem->partition_name,
			part_sep,
			FN_REFLEN - table_name_len);

		if (!form->part_info->is_sub_partitioned()) {
			error = info.create_table_update_dict();
			if (error != 0) {
				ut_ad(0);
				goto end;
			}
		} else {
			size_t	part_name_len = strlen(table_name_end);
			char*	part_name_end = table_name_end + part_name_len;
			List_iterator_fast <partition_element>
				sub_it(part_elem->subpartitions);
			partition_element*	sub_elem;
			while ((sub_elem = sub_it++)) {
				len = Ha_innopart_share::append_sep_and_name(
					part_name_end,
					sub_elem->partition_name,
					sub_sep,
					FN_REFLEN - table_name_len
					- part_name_len);

				error = info.create_table_update_dict();
				if (error != 0) {
					ut_ad(0);
					goto end;
				}
			}
		}
	}

end:
	/* Tell the InnoDB server that there might be work for
	utility threads: */

	srv_active_wake_master_thread();

	trx_free_for_mysql(info.trx());

	DBUG_RETURN(error);

cleanup:
	/* Roll back and drop all partitions created so far. The trailing
	'#' makes the drop match only this table's partition files. */
	trx_rollback_for_mysql(info.trx());

	row_mysql_unlock_data_dictionary(info.trx());

	ulint	dummy;
	char	norm_name[FN_REFLEN];

	normalize_table_name(norm_name, name);

	uint	lent = (uint)strlen(norm_name);
	ut_a(lent < FN_REFLEN);
	norm_name[lent] = '#';
	norm_name[lent + 1] = 0;

	row_drop_database_for_mysql(norm_name, info.trx(), &dummy);

	trx_free_for_mysql(info.trx());
	DBUG_RETURN(error);
}
3045
3046 /** Discards or imports an InnoDB tablespace.
3047 @param[in] discard True if discard, else import.
3048 @return 0 or error number. */
3049 int
discard_or_import_tablespace(my_bool discard)3050 ha_innopart::discard_or_import_tablespace(
3051 my_bool discard)
3052 {
3053 int error = 0;
3054 uint i;
3055 DBUG_ENTER("ha_innopart::discard_or_import_tablespace");
3056
3057 for (i= m_part_info->get_first_used_partition();
3058 i < m_tot_parts;
3059 i= m_part_info->get_next_used_partition(i)) {
3060
3061 m_prebuilt->table = m_part_share->get_table_part(i);
3062 error= ha_innobase::discard_or_import_tablespace(discard);
3063 if (error != 0) {
3064 break;
3065 }
3066 }
3067 m_prebuilt->table = m_part_share->get_table_part(0);
3068
3069 /* IMPORT/DISCARD also means resetting auto_increment. Make sure
3070 that auto_increment initialization is done after all partitions
3071 are imported. */
3072 if (table->found_next_number_field != NULL) {
3073 lock_auto_increment();
3074 m_part_share->next_auto_inc_val = 0;
3075 m_part_share->auto_inc_initialized = false;
3076 unlock_auto_increment();
3077 }
3078
3079 DBUG_RETURN(error);
3080 }
3081
3082 /** This function reads zip dict-related info from the base class.
3083 @param thd Thread handler
3084 @param part_name Must be always NULL.
3085 */
update_field_defs_with_zip_dict_info(THD * thd,const char * part_name)3086 void ha_innopart::update_field_defs_with_zip_dict_info(THD* thd,
3087 const char* part_name)
3088 {
3089 DBUG_ENTER("ha_innopart::update_field_defs_with_zip_dict_info");
3090 char partition_name[FN_REFLEN];
3091 bool res = get_first_partition_name(
3092 thd, this, table_share->normalized_path.str,
3093 table_share->partition_info_str,
3094 table_share->partition_info_str_len, partition_name);
3095 if (res)
3096 {
3097 ut_ad(0);
3098 DBUG_VOID_RETURN;
3099 }
3100
3101 ha_innobase::update_field_defs_with_zip_dict_info(thd, partition_name);
3102 DBUG_VOID_RETURN;
3103 }
3104
3105 /** Compare key and rowid.
3106 Helper function for sorting records in the priority queue.
3107 a/b points to table->record[0] rows which must have the
3108 key fields set. The bytes before a and b store the rowid.
3109 This is used for comparing/sorting rows first according to
3110 KEY and if same KEY, by rowid (ref).
3111 @param[in] key_info Null terminated array of index information.
3112 @param[in] a Pointer to record+ref in first record.
3113 @param[in] b Pointer to record+ref in second record.
3114 @return Return value is SIGN(first_rec - second_rec)
3115 @retval 0 Keys are equal.
3116 @retval -1 second_rec is greater than first_rec.
3117 @retval +1 first_rec is greater than second_rec. */
3118 int
key_and_rowid_cmp(KEY ** key_info,uchar * a,uchar * b)3119 ha_innopart::key_and_rowid_cmp(
3120 KEY** key_info,
3121 uchar *a,
3122 uchar *b)
3123 {
3124 int cmp = key_rec_cmp(key_info, a, b);
3125 if (cmp != 0) {
3126 return(cmp);
3127 }
3128
3129 /* We must compare by rowid, which is added before the record,
3130 in the priority queue. */
3131
3132 return(memcmp(a - DATA_ROW_ID_LEN, b - DATA_ROW_ID_LEN,
3133 DATA_ROW_ID_LEN));
3134 }
3135
3136 /** Extra hints from MySQL.
3137 @param[in] operation Operation hint.
3138 @return 0 or error number. */
3139 int
extra(enum ha_extra_function operation)3140 ha_innopart::extra(
3141 enum ha_extra_function operation)
3142 {
3143 if (operation == HA_EXTRA_SECONDARY_SORT_ROWID) {
3144 /* index_init(sorted=true) must have been called! */
3145 ut_ad(m_ordered);
3146 ut_ad(m_ordered_rec_buffer != NULL);
3147 /* No index_read call must have been done! */
3148 ut_ad(m_queue->empty());
3149
3150 /* If not PK is set as secondary sort, do secondary sort by
3151 rowid/ref. */
3152
3153 ut_ad(m_curr_key_info[1] != NULL
3154 || m_prebuilt->clust_index_was_generated != 0
3155 || m_curr_key_info[0]
3156 == table->key_info + table->s->primary_key);
3157
3158 if (m_curr_key_info[1] == NULL
3159 && m_prebuilt->clust_index_was_generated) {
3160 m_ref_usage = Partition_helper::REF_USED_FOR_SORT;
3161 m_queue->m_fun = key_and_rowid_cmp;
3162 }
3163 return(0);
3164 }
3165 return(ha_innobase::extra(operation));
3166 }
3167
/** Delete all rows in a partition.
Delegates to truncate(), which only acts on the partitions marked
as used in m_part_info (set up by the caller before this is invoked).
@return	0 or error number. */
int
ha_innopart::truncate_partition_low()
{
	return(truncate());
}
3175
/** Deletes all rows of a partitioned InnoDB table.
Truncates every used partition, resets the cached auto-increment
state, and maps the InnoDB error code to a handler error code.
@return	0 or error number. */
int
ha_innopart::truncate()
{
	dberr_t	err = DB_SUCCESS;
	int	error;

	DBUG_ENTER("ha_innopart::truncate");

	/* TRUNCATE is a write operation; refuse it on a read-only
	server instance. */
	if (high_level_read_only) {
		DBUG_RETURN(HA_ERR_TABLE_READONLY);
	}

	/* TRUNCATE also means resetting auto_increment. Hence, reset
	it so that it will be initialized again at the next use. */

	if (table->found_next_number_field != NULL) {
		lock_auto_increment();
		m_part_share->next_auto_inc_val= 0;
		m_part_share->auto_inc_initialized= false;
		unlock_auto_increment();
	}

	/* Get the transaction associated with the current thd, or create one
	if not yet created, and update m_prebuilt->trx. */

	update_thd(ha_thd());

	if (!trx_is_started(m_prebuilt->trx)) {
		++m_prebuilt->trx->will_lock;
	}
	/* Truncate the table in InnoDB. */

	/* Truncate each used partition in turn; stop at the first
	partition that fails. */
	for (uint i = m_part_info->get_first_used_partition();
	     i < m_tot_parts;
	     i = m_part_info->get_next_used_partition(i)) {

		set_partition(i);
		err = row_truncate_table_for_mysql(m_prebuilt->table,
						   m_prebuilt->trx);
		update_partition(i);
		if (err != DB_SUCCESS) {
			break;
		}
	}

	/* Map the InnoDB error code to a handler-level error code. */
	switch (err) {

	case DB_TABLESPACE_DELETED:
	case DB_TABLESPACE_NOT_FOUND:
		/* The tablespace was discarded or its .ibd file is
		missing; report the table as gone. */
		ib_senderrf(
			m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
			(err == DB_TABLESPACE_DELETED ?
			 ER_TABLESPACE_DISCARDED : ER_TABLESPACE_MISSING),
			table->s->table_name.str);
		table->status = STATUS_NOT_FOUND;
		error = HA_ERR_NO_SUCH_TABLE;
		break;

	default:
		/* Note: DB_SUCCESS maps to 0 here. */
		error = convert_error_code_to_mysql(
			err, m_prebuilt->table->flags,
			m_prebuilt->trx->mysql_thd);
		table->status = STATUS_NOT_FOUND;
		break;
	}
	DBUG_RETURN(error);
}
3245
#ifdef WL6742

/* Removing Wl6742 as part of Bug#23046302 */
/* NOTE: WL6742 is not defined, so this function is compiled out. */

/** Total number of rows in all used partitions.
Returns the exact number of records that this client can see using this
handler object.
@param[out]	num_rows	Number of rows.
@return	0 or error number. */
int
ha_innopart::records(
	ha_rows*	num_rows)
{
	ha_rows	n_rows;
	int	err;
	DBUG_ENTER("ha_innopart::records()");

	*num_rows = 0;

	/* The index scan is probably so expensive, so the overhead
	of the rest of the function is neglectable for each partition.
	So no current reason for optimizing this further. */

	/* Sum the exact counts of all used partitions; on the first
	failure report HA_POS_ERROR to the caller. */
	for (uint i = m_part_info->get_first_used_partition();
	     i < m_tot_parts;
	     i = m_part_info->get_next_used_partition(i)) {

		set_partition(i);
		err = ha_innobase::records(&n_rows);
		update_partition(i);
		if (err != 0) {
			*num_rows = HA_POS_ERROR;
			DBUG_RETURN(err);
		}
		*num_rows += n_rows;
	}
	DBUG_RETURN(0);
}
#endif
3285
/** Estimates the number of index records in a range.
Converts the MySQL key range to InnoDB search tuples once, then sums
the B-tree range estimates of every used partition.
@param[in]	keynr	Index number.
@param[in]	min_key	Start key value (or NULL).
@param[in]	max_key	End key value (or NULL).
@return	estimated number of rows. */
ha_rows
ha_innopart::records_in_range(
	uint		keynr,
	key_range*	min_key,
	key_range*	max_key)
{
	KEY*		key;
	dict_index_t*	index;
	dtuple_t*	range_start;
	dtuple_t*	range_end;
	int64_t		n_rows = 0;
	page_cur_mode_t	mode1;
	page_cur_mode_t	mode2;
	mem_heap_t*	heap;
	uint		part_id;

	DBUG_ENTER("ha_innopart::records_in_range");
	DBUG_PRINT("info", ("keynr %u min %p max %p", keynr, min_key, max_key));

	/* NOTE(review): these two calls look like overrides that force a
	fixed estimate via system variables — confirm against the
	corresponding sysvar definitions. */
	ha_rows ret = innodb_records_in_range(ha_thd());
	if (ret) {
		DBUG_RETURN(ret);
	}
	if (table->force_index) {
		const ha_rows force_rows = innodb_force_index_records_in_range(ha_thd());
		if (force_rows) {
			DBUG_RETURN(force_rows);
		}
	}

	ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));

	m_prebuilt->trx->op_info = (char*)"estimating records in index range";

	/* In case MySQL calls this in the middle of a SELECT query, release
	possible adaptive hash latch to avoid deadlocks of threads. */

	trx_search_latch_release_if_reserved(m_prebuilt->trx);

	active_index = keynr;

	key = table->key_info + active_index;

	part_id = m_part_info->get_first_used_partition();
	if (part_id == MY_BIT_NONE) {
		/* No partition is used at all: nothing to estimate. */
		DBUG_RETURN(0);
	}
	/* This also sets m_prebuilt->index! */
	set_partition(part_id);
	index = m_prebuilt->index;

	/* There exists possibility of not being able to find requested
	index due to inconsistency between MySQL and InnoDB dictionary info.
	Necessary message should have been printed in innopart_get_index(). */
	if (index == NULL
	    || dict_table_is_discarded(m_prebuilt->table)
	    || !row_merge_is_index_usable(m_prebuilt->trx, index)) {

		n_rows = HA_POS_ERROR;
		goto func_exit;
	}

	/* Heap sized for the two search tuples built below. */
	heap = mem_heap_create(2 * (key->actual_key_parts * sizeof(dfield_t)
				    + sizeof(dtuple_t)));

	range_start = dtuple_create(heap, key->actual_key_parts);
	dict_index_copy_types(range_start, index, key->actual_key_parts);

	range_end = dtuple_create(heap, key->actual_key_parts);
	dict_index_copy_types(range_end, index, key->actual_key_parts);

	/* Convert the MySQL key values to InnoDB search tuples; a NULL
	key means an open (unbounded) end of the range. */
	row_sel_convert_mysql_key_to_innobase(
		range_start,
		m_prebuilt->srch_key_val1,
		m_prebuilt->srch_key_val_len,
		index,
		(byte*) (min_key ? min_key->key : (const uchar*) 0),
		(ulint) (min_key ? min_key->length : 0),
		m_prebuilt->trx);

	ut_ad(min_key != NULL
	      ? range_start->n_fields > 0
	      : range_start->n_fields == 0);

	row_sel_convert_mysql_key_to_innobase(
		range_end,
		m_prebuilt->srch_key_val2,
		m_prebuilt->srch_key_val_len,
		index,
		(byte*) (max_key != NULL ? max_key->key : (const uchar*) 0),
		(ulint) (max_key != NULL ? max_key->length : 0),
		m_prebuilt->trx);

	ut_ad(max_key != NULL
	      ? range_end->n_fields > 0
	      : range_end->n_fields == 0);

	mode1 = convert_search_mode_to_innobase(min_key ? min_key->flag :
						HA_READ_KEY_EXACT);
	mode2 = convert_search_mode_to_innobase(max_key ? max_key->flag :
						HA_READ_KEY_EXACT);

	if (mode1 != PAGE_CUR_UNSUPP && mode2 != PAGE_CUR_UNSUPP) {

		/* Estimate the first used partition, then add the
		estimates of all remaining used partitions. */
		n_rows = btr_estimate_n_rows_in_range(index, range_start,
						      mode1, range_end,
						      mode2);
		DBUG_PRINT("info", ("part_id %u rows %ld", part_id,
				    (long int) n_rows));
		for (part_id = m_part_info->get_next_used_partition(part_id);
		     part_id < m_tot_parts;
		     part_id = m_part_info->get_next_used_partition(part_id)) {

			index = m_part_share->get_index(part_id, keynr);
			/* Individual partitions can be discarded
			we need to check each partition */
			if (index == NULL
			    || dict_table_is_discarded(index->table)
			    || !row_merge_is_index_usable(m_prebuilt->trx,index))
			{

				n_rows = HA_POS_ERROR;
				mem_heap_free(heap);
				goto func_exit;
			}
			int64_t	n = btr_estimate_n_rows_in_range(index,
								 range_start,
								 mode1,
								 range_end,
								 mode2);
			n_rows += n;
			DBUG_PRINT("info", ("part_id %u rows %ld (%ld)",
					    part_id,
					    (long int) n,
					    (long int) n_rows));
		}
	} else {
		/* Unsupported search mode: cannot estimate. */
		n_rows = HA_POS_ERROR;
	}

	mem_heap_free(heap);

func_exit:

	m_prebuilt->trx->op_info = (char*)"";

	/* The MySQL optimizer seems to believe an estimate of 0 rows is
	always accurate and may return the result 'Empty set' based on that.
	The accuracy is not guaranteed, and even if it were, for a locking
	read we should anyway perform the search to set the next-key lock.
	Add 1 to the value to make sure MySQL does not make the assumption! */

	if (n_rows == 0) {
		n_rows = 1;
	}

	DBUG_RETURN((ha_rows) n_rows);
}
3450
3451 /** Gives an UPPER BOUND to the number of rows in a table.
3452 This is used in filesort.cc.
3453 @return upper bound of rows. */
3454 ha_rows
estimate_rows_upper_bound()3455 ha_innopart::estimate_rows_upper_bound()
3456 {
3457 const dict_index_t* index;
3458 ulonglong estimate = 0;
3459 ulonglong local_data_file_length;
3460 ulint stat_n_leaf_pages;
3461
3462 DBUG_ENTER("ha_innopart::estimate_rows_upper_bound");
3463
3464 /* We do not know if MySQL can call this function before calling
3465 external_lock(). To be safe, update the thd of the current table
3466 handle. */
3467
3468 update_thd(ha_thd());
3469
3470 m_prebuilt->trx->op_info = "calculating upper bound for table rows";
3471
3472 /* In case MySQL calls this in the middle of a SELECT query, release
3473 possible adaptive hash latch to avoid deadlocks of threads. */
3474
3475 trx_search_latch_release_if_reserved(m_prebuilt->trx);
3476
3477 for (uint i = m_part_info->get_first_used_partition();
3478 i < m_tot_parts;
3479 i = m_part_info->get_next_used_partition(i)) {
3480
3481 m_prebuilt->table = m_part_share->get_table_part(i);
3482 index = dict_table_get_first_index(m_prebuilt->table);
3483
3484 stat_n_leaf_pages = index->stat_n_leaf_pages;
3485
3486 ut_a(stat_n_leaf_pages > 0);
3487
3488 local_data_file_length =
3489 ((ulonglong) stat_n_leaf_pages) * UNIV_PAGE_SIZE;
3490
3491 /* Calculate a minimum length for a clustered index record
3492 and from that an upper bound for the number of rows.
3493 Since we only calculate new statistics in row0mysql.cc when a
3494 table has grown by a threshold factor,
3495 we must add a safety factor 2 in front of the formula below. */
3496
3497 estimate += 2 * local_data_file_length
3498 / dict_index_calc_min_rec_len(index);
3499 }
3500
3501 m_prebuilt->trx->op_info = "";
3502
3503 DBUG_RETURN((ha_rows) estimate);
3504 }
3505
3506 /** Time estimate for full table scan.
3507 How many seeks it will take to read through the table. This is to be
3508 comparable to the number returned by records_in_range so that we can
3509 decide if we should scan the table or use keys.
3510 @return estimated time measured in disk seeks. */
3511 double
scan_time()3512 ha_innopart::scan_time()
3513 {
3514 double scan_time = 0.0;
3515 DBUG_ENTER("ha_innopart::scan_time");
3516
3517 for (uint i = m_part_info->get_first_used_partition();
3518 i < m_tot_parts;
3519 i = m_part_info->get_next_used_partition(i)) {
3520 m_prebuilt->table = m_part_share->get_table_part(i);
3521 scan_time += ha_innobase::scan_time();
3522 }
3523 DBUG_RETURN(scan_time);
3524 }
3525
3526 /** Updates the statistics for one partition (table).
3527 @param[in] table Table to update the statistics for.
3528 @param[in] is_analyze True if called from ::analyze().
3529 @return error code. */
3530 static
3531 int
update_table_stats(dict_table_t * table,bool is_analyze)3532 update_table_stats(
3533 dict_table_t* table,
3534 bool is_analyze)
3535 {
3536 dict_stats_upd_option_t opt;
3537 dberr_t ret;
3538
3539 if (dict_stats_is_persistent_enabled(table)) {
3540 if (is_analyze) {
3541 opt = DICT_STATS_RECALC_PERSISTENT;
3542 } else {
3543 /* This is e.g. 'SHOW INDEXES',
3544 fetch the persistent stats from disk. */
3545 opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY;
3546 }
3547 } else {
3548 opt = DICT_STATS_RECALC_TRANSIENT;
3549 }
3550
3551 ut_ad(!mutex_own(&dict_sys->mutex));
3552 ret = dict_stats_update(table, opt);
3553
3554 if (ret != DB_SUCCESS) {
3555 return(HA_ERR_GENERIC);
3556 }
3557 return(0);
3558 }
3559
/** Updates and return statistics.
Returns statistics information of the table to the MySQL interpreter,
in various fields of the handle object. Each HA_STATUS_* flag selects
one section below; aggregates are summed over the used partitions and
index statistics are taken from the biggest partition.
@param[in]	flag		Flags for what to update and return.
@param[in]	is_analyze	True if called from ::analyze().
@return	HA_ERR_* error code or 0. */
int
ha_innopart::info_low(
	uint	flag,
	bool	is_analyze)
{
	dict_table_t*	ib_table;
	ib_uint64_t	max_rows = 0;
	uint		biggest_partition = 0;
	int		error = 0;

	DBUG_ENTER("ha_innopart::info_low");

	/* If we are forcing recovery at a high level, we will suppress
	statistics calculation on tables, because that may crash the
	server if an index is badly corrupted. */

	/* We do not know if MySQL can call this function before calling
	external_lock(). To be safe, update the thd of the current table
	handle. */

	update_thd(ha_thd());

	/* In case MySQL calls this in the middle of a SELECT query, release
	possible adaptive hash latch to avoid deadlocks of threads. */

	m_prebuilt->trx->op_info = (char*)"returning various info to MySQL";

	trx_search_latch_release_if_reserved(m_prebuilt->trx);

	ut_ad(m_part_share->get_table_part(0)->n_ref_count > 0);

	if ((flag & HA_STATUS_TIME) != 0) {
		stats.update_time = 0;

		if (is_analyze) {
			/* Only analyze the given partitions. */
			/* NOTE: this 'error' deliberately shadows the
			outer one; a failure returns immediately, so the
			outer 'error' is never affected. */
			int	error = set_altered_partitions();
			if (error != 0) {
				/* Already checked in mysql_admin_table! */
				ut_ad(0);
				DBUG_RETURN(error);
			}
		}
		if (is_analyze || innobase_stats_on_metadata) {
			m_prebuilt->trx->op_info = "updating table statistics";
		}

		/* TODO: Only analyze the PK for all partitions,
		then the secondary indexes only for the largest partition! */
		for (uint i = m_part_info->get_first_used_partition();
		     i < m_tot_parts;
		     i = m_part_info->get_next_used_partition(i)) {

			ib_table = m_part_share->get_table_part(i);
			if (is_analyze || innobase_stats_on_metadata) {
				error = update_table_stats(ib_table, is_analyze);
				if (error != 0) {
					m_prebuilt->trx->op_info = "";
					DBUG_RETURN(error);
				}
			}
			/* Report the newest update_time of any
			partition as the table's update_time. */
			set_if_bigger(stats.update_time,
				      (ulong) ib_table->update_time);
		}

		if (is_analyze || innobase_stats_on_metadata) {
			m_prebuilt->trx->op_info =
				"returning various info to MySQL";
		}
	}

	if ((flag & HA_STATUS_VARIABLE) != 0) {

		/* TODO: If this is called after pruning, then we could
		also update the statistics according to the non-pruned
		partitions, by allocating new rec_per_key on the TABLE,
		instead of using the info from the TABLE_SHARE. */
		ulint	stat_clustered_index_size = 0;
		ulint	stat_sum_of_other_index_sizes = 0;
		ib_uint64_t	n_rows = 0;
		ulint	avail_space = 0;
		bool	checked_sys_tablespace = false;

		if ((flag & HA_STATUS_VARIABLE_EXTRA) != 0) {
			stats.delete_length = 0;
		}

		/* Sum sizes and row counts over all used partitions,
		remembering the biggest one for the CONST section. */
		for (uint i = m_part_info->get_first_used_partition();
		     i < m_tot_parts;
		     i = m_part_info->get_next_used_partition(i)) {

			ib_table = m_part_share->get_table_part(i);
			if ((flag & HA_STATUS_NO_LOCK) == 0) {
				dict_table_stats_lock(ib_table, RW_S_LATCH);
			}

			ut_a(ib_table->stat_initialized);

			n_rows += ib_table->stat_n_rows;
			if (ib_table->stat_n_rows > max_rows) {
				max_rows = ib_table->stat_n_rows;
				biggest_partition = i;
			}

			stat_clustered_index_size +=
				ib_table->stat_clustered_index_size;

			stat_sum_of_other_index_sizes +=
				ib_table->stat_sum_of_other_index_sizes;

			if ((flag & HA_STATUS_NO_LOCK) == 0) {
				dict_table_stats_unlock(ib_table, RW_S_LATCH);
			}

			if ((flag & HA_STATUS_VARIABLE_EXTRA) != 0
			    && (flag & HA_STATUS_NO_LOCK) == 0
			    && srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE
			    && avail_space != ULINT_UNDEFINED) {

				/* Only count system tablespace once! */
				if (is_system_tablespace(ib_table->space)) {
					if (checked_sys_tablespace) {
						continue;
					}
					checked_sys_tablespace = true;
				}

				uintmax_t	space =
					fsp_get_available_space_in_free_extents(
						ib_table->space);
				if (space == UINTMAX_MAX) {
					/* Tablespace discarded or .ibd
					missing: warn and count zero free
					space for this partition. */
					THD*	thd = ha_thd();
					const char*	table_name
						= ib_table->name.m_name;

					push_warning_printf(
						thd,
						Sql_condition::SL_WARNING,
						ER_CANT_GET_STAT,
						"InnoDB: Trying to get the"
						" free space for partition %s"
						" but its tablespace has been"
						" discarded or the .ibd file"
						" is missing. Setting the free"
						" space of the partition to"
						" zero.",
						ut_get_name(
							m_prebuilt->trx,
							table_name).c_str());
				} else {
					avail_space +=
						static_cast<ulint>(space);
				}
			}
		}

		/*
		The MySQL optimizer seems to assume in a left join that n_rows
		is an accurate estimate if it is zero. Of course, it is not,
		since we do not have any locks on the rows yet at this phase.
		Since SHOW TABLE STATUS seems to call this function with the
		HA_STATUS_TIME flag set, while the left join optimizer does not
		set that flag, we add one to a zero value if the flag is not
		set. That way SHOW TABLE STATUS will show the best estimate,
		while the optimizer never sees the table empty. */

		if (n_rows == 0 && (flag & HA_STATUS_TIME) == 0) {
			n_rows++;
		}

		/* Fix bug#40386: Not flushing query cache after truncate.
		n_rows can not be 0 unless the table is empty, set to 1
		instead. The original problem of bug#29507 is actually
		fixed in the server code. */
		if (thd_sql_command(m_user_thd) == SQLCOM_TRUNCATE) {

			n_rows = 1;

			/* We need to reset the m_prebuilt value too, otherwise
			checks for values greater than the last value written
			to the table will fail and the autoinc counter will
			not be updated. This will force write_row() into
			attempting an update of the table's AUTOINC counter. */

			m_prebuilt->autoinc_last_value = 0;
		}

		/* Take page_size from first partition. */
		ib_table = m_part_share->get_table_part(0);
		const page_size_t&	page_size =
			dict_table_page_size(ib_table);

		stats.records = (ha_rows) n_rows;
		stats.deleted = 0;
		stats.data_file_length =
			((ulonglong) stat_clustered_index_size)
			* page_size.physical();
		stats.index_file_length =
			((ulonglong) stat_sum_of_other_index_sizes)
			* page_size.physical();

		/* See ha_innobase::info_low() for comments! */
		if ((flag & HA_STATUS_NO_LOCK) == 0
		    && (flag & HA_STATUS_VARIABLE_EXTRA) != 0
		    && srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) {
			stats.delete_length = avail_space * 1024;
		}

		stats.check_time = 0;
		stats.mrr_length_per_rec = ref_length + sizeof(void*)
			- PARTITION_BYTES_IN_POS;

		if (stats.records == 0) {
			stats.mean_rec_length = 0;
		} else {
			stats.mean_rec_length = (ulong)
				(stats.data_file_length / stats.records);
		}
	}

	if ((flag & HA_STATUS_CONST) != 0) {
		/* Find max rows and biggest partition. */
		for (uint i = 0; i < m_tot_parts; i++) {
			/* Skip partitions from above. */
			if ((flag & HA_STATUS_VARIABLE) == 0
			    || !bitmap_is_set(&(m_part_info->read_partitions),
					      i)) {

				ib_table = m_part_share->get_table_part(i);
				if (ib_table->stat_n_rows > max_rows) {
					max_rows = ib_table->stat_n_rows;
					biggest_partition = i;
				}
			}
		}
		ib_table = m_part_share->get_table_part(biggest_partition);
		/* Verify the number of index in InnoDB and MySQL
		matches up. If m_prebuilt->clust_index_was_generated
		holds, InnoDB defines GEN_CLUST_INDEX internally. */
		ulint	num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes)
			- m_prebuilt->clust_index_was_generated;
		if (table->s->keys < num_innodb_index) {
			/* If there are too many indexes defined
			inside InnoDB, ignore those that are being
			created, because MySQL will only consider
			the fully built indexes here. */

			for (const dict_index_t* index =
				UT_LIST_GET_FIRST(ib_table->indexes);
			     index != NULL;
			     index = UT_LIST_GET_NEXT(indexes, index)) {

				/* First, online index creation is
				completed inside InnoDB, and then
				MySQL attempts to upgrade the
				meta-data lock so that it can rebuild
				the .frm file. If we get here in that
				time frame, dict_index_is_online_ddl()
				would not hold and the index would
				still not be included in TABLE_SHARE. */
				if (!index->is_committed()) {
					num_innodb_index--;
				}
			}

			/* A hidden FTS_DOC_ID index is also invisible
			to MySQL. */
			if (table->s->keys < num_innodb_index
			    && (innobase_fts_check_doc_id_index(ib_table,
								NULL, NULL)
				== FTS_EXIST_DOC_ID_INDEX)) {
				num_innodb_index--;
			}
		}

		if (table->s->keys != num_innodb_index) {
			ib::error() << "Table "
				<< ib_table->name << " contains "
				<< num_innodb_index
				<< " indexes inside InnoDB, which"
				" is different from the number of"
				" indexes " << table->s->keys
				<< " defined in the MySQL";
		}

		if ((flag & HA_STATUS_NO_LOCK) == 0) {
			dict_table_stats_lock(ib_table, RW_S_LATCH);
		}

		ut_a(ib_table->stat_initialized);

		/* Fill in rec_per_key for every key part, using the
		statistics of the biggest partition. */
		for (ulong i = 0; i < table->s->keys; i++) {
			ulong	j;
			/* We could get index quickly through internal
			index mapping with the index translation table.
			The identity of index (match up index name with
			that of table->key_info[i]) is already verified in
			innopart_get_index(). */
			dict_index_t*	index = innopart_get_index(
				biggest_partition, i);

			if (index == NULL) {
				ib::error() << "Table "
					<< ib_table->name << " contains fewer"
					" indexes inside InnoDB than"
					" are defined in the MySQL"
					" .frm file. Have you mixed up"
					" .frm files from different"
					" installations? "
					<< TROUBLESHOOTING_MSG;
				break;
			}

			KEY*	key = &table->key_info[i];
			for (j = 0;
			     j < key->actual_key_parts;
			     j++) {

				if ((key->flags & HA_FULLTEXT) != 0) {
					/* The whole concept has no validity
					for FTS indexes. */
					key->rec_per_key[j] = 1;
					continue;
				}

				if ((j + 1) > index->n_uniq) {
					ib::error() << "Index " << index->name
						<< " of " << ib_table->name
						<< " has " << index->n_uniq
						<< " columns unique inside"
						" InnoDB, but MySQL is"
						" asking statistics for "
						<< j + 1 << " columns. Have"
						" you mixed up .frm files"
						" from different"
						" installations? "
						<< TROUBLESHOOTING_MSG;
					break;
				}

				/* innodb_rec_per_key() will use
				index->stat_n_diff_key_vals[] and the value we
				pass index->table->stat_n_rows. Both are
				calculated by ANALYZE and by the background
				stats gathering thread (which kicks in when too
				much of the table has been changed). In
				addition table->stat_n_rows is adjusted with
				each DML (e.g. ++ on row insert). Those
				adjustments are not MVCC'ed and not even
				reversed on rollback. So,
				index->stat_n_diff_key_vals[] and
				index->table->stat_n_rows could have been
				calculated at different time. This is
				acceptable. */
				const rec_per_key_t	rec_per_key =
					innodb_rec_per_key(
						index, j,
						max_rows);

				key->set_records_per_key(j, rec_per_key);

				/* The code below is legacy and should be
				removed together with this comment once we
				are sure the new floating point rec_per_key,
				set via set_records_per_key(), works fine. */

				ulong	rec_per_key_int = static_cast<ulong>(
					innodb_rec_per_key(index, j,
							   max_rows));

				/* Since MySQL seems to favor table scans
				too much over index searches, we pretend
				index selectivity is 2 times better than
				our estimate: */

				rec_per_key_int = rec_per_key_int / 2;

				if (rec_per_key_int == 0) {
					rec_per_key_int = 1;
				}

				key->rec_per_key[j] = rec_per_key_int;
			}
		}

		if ((flag & HA_STATUS_NO_LOCK) == 0) {
			dict_table_stats_unlock(ib_table, RW_S_LATCH);
		}

		char	path[FN_REFLEN];
		os_file_stat_t	stat_info;
		/* Use the first partition for create time until new DD. */
		ib_table = m_part_share->get_table_part(0);
		/* Derive create_time from the ctime of the .frm file. */
		my_snprintf(path, sizeof(path), "%s/%s%s",
			    mysql_data_home,
			    table->s->normalized_path.str,
			    reg_ext);

		unpack_filename(path,path);

		if (os_file_get_status(path, &stat_info, false, true) == DB_SUCCESS) {
			stats.create_time = (ulong) stat_info.ctime;
		}
	}

	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
		/* Skip error-key and auto-inc reporting during
		high-level forced recovery. */
		goto func_exit;
	}

	if ((flag & HA_STATUS_ERRKEY) != 0) {
		const dict_index_t*	err_index;

		ut_a(m_prebuilt->trx);
		ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N);

		err_index = trx_get_error_index(m_prebuilt->trx);

		if (err_index != NULL) {
			errkey = m_part_share->get_mysql_key(m_last_part,
							     err_index);
		} else {
			errkey = (unsigned int) (
				(m_prebuilt->trx->error_key_num
				 == ULINT_UNDEFINED)
				? UINT_MAX
				: m_prebuilt->trx->error_key_num);
		}
	}

	if ((flag & HA_STATUS_AUTO) != 0) {
		/* auto_inc is only supported in first key for InnoDB! */
		ut_ad(table_share->next_number_keypart == 0);
		DBUG_PRINT("info", ("HA_STATUS_AUTO"));
		if (table->found_next_number_field == NULL) {
			stats.auto_increment_value = 0;
		} else {
			/* Lock to avoid two concurrent initializations. */
			lock_auto_increment();
			if (m_part_share->auto_inc_initialized) {
				stats.auto_increment_value =
					m_part_share->next_auto_inc_val;
			} else {
				/* The auto-inc mutex in the table_share is
				locked, so we do not need to have the handlers
				locked. */

				error = initialize_auto_increment(
					(flag & HA_STATUS_NO_LOCK) != 0);
				stats.auto_increment_value =
					m_part_share->next_auto_inc_val;
			}
			unlock_auto_increment();
		}
	}

func_exit:
	m_prebuilt->trx->op_info = (char*)"";

	DBUG_RETURN(error);
}
4025
/** Optimize table.
This is mapped to "ALTER TABLE tablename ENGINE=InnoDB", which rebuilds
the table in MySQL.
@param[in]	thd		Connection thread handle.
@param[in]	check_opt	Currently ignored.
@return	0 for success else error code. */
int
ha_innopart::optimize(
	THD*		thd,
	HA_CHECK_OPT*	check_opt)
{
	/* No native OPTIMIZE: ask the server to rebuild via ALTER. */
	return(HA_ADMIN_TRY_ALTER);
}
4039
4040 /** Checks a partitioned table.
4041 Tries to check that an InnoDB table is not corrupted. If corruption is
4042 noticed, prints to stderr information about it. In case of corruption
4043 may also assert a failure and crash the server. Also checks for records
4044 in wrong partition.
4045 @param[in] thd MySQL THD object/thread handle.
4046 @param[in] check_opt Check options.
4047 @return HA_ADMIN_CORRUPT or HA_ADMIN_OK. */
4048 int
check(THD * thd,HA_CHECK_OPT * check_opt)4049 ha_innopart::check(
4050 THD* thd,
4051 HA_CHECK_OPT* check_opt)
4052 {
4053 uint error = HA_ADMIN_OK;
4054 uint i;
4055
4056 DBUG_ENTER("ha_innopart::check");
4057 /* TODO: Enhance this to:
4058 - Every partition has the same structure.
4059 - The names are correct (partition names checked in ::open()?)
4060 Currently it only does normal InnoDB check of each partition. */
4061
4062 if (set_altered_partitions()) {
4063 ut_ad(0); // Already checked by set_part_state()!
4064 DBUG_RETURN(HA_ADMIN_INVALID);
4065 }
4066 for (i = m_part_info->get_first_used_partition();
4067 i < m_tot_parts;
4068 i = m_part_info->get_next_used_partition(i)) {
4069
4070 m_prebuilt->table = m_part_share->get_table_part(i);
4071 error = ha_innobase::check(thd, check_opt);
4072 if (error != 0) {
4073 break;
4074 }
4075 if ((check_opt->flags & (T_MEDIUM | T_EXTEND)) != 0) {
4076 error = Partition_helper::check_misplaced_rows(i, false);
4077 if (error != 0) {
4078 break;
4079 }
4080 }
4081 }
4082 if (error != 0) {
4083 print_admin_msg(
4084 thd,
4085 256,
4086 "error",
4087 table_share->db.str,
4088 table->alias,
4089 "check",
4090 m_is_sub_partitioned ?
4091 "Subpartition %s returned error"
4092 : "Partition %s returned error",
4093 m_part_share->get_partition_name(i));
4094 }
4095
4096 DBUG_RETURN(error);
4097 }
4098
4099 /** Repair a partitioned table.
4100 Only repairs records in wrong partitions (moves them to the correct
4101 partition or deletes them if not in any partition).
4102 @param[in] thd MySQL THD object/thread handle.
4103 @param[in] repair_opt Repair options.
4104 @return 0 or error code. */
4105 int
repair(THD * thd,HA_CHECK_OPT * repair_opt)4106 ha_innopart::repair(
4107 THD* thd,
4108 HA_CHECK_OPT* repair_opt)
4109 {
4110 uint error = HA_ADMIN_OK;
4111
4112 DBUG_ENTER("ha_innopart::repair");
4113
4114 /* TODO: enable this warning to be clear about what is repaired.
4115 Currently disabled to generate smaller test diffs. */
4116 #ifdef ADD_WARNING_FOR_REPAIR_ONLY_PARTITION
4117 push_warning_printf(thd, Sql_condition::SL_WARNING,
4118 ER_ILLEGAL_HA,
4119 "Only moving rows from wrong partition to correct"
4120 " partition is supported,"
4121 " repairing InnoDB indexes is not yet supported!");
4122 #endif
4123
4124 /* Only repair partitions for MEDIUM or EXTENDED options. */
4125 if ((repair_opt->flags & (T_MEDIUM | T_EXTEND)) == 0) {
4126 DBUG_RETURN(HA_ADMIN_OK);
4127 }
4128 if (set_altered_partitions()) {
4129 ut_ad(0); // Already checked by set_part_state()!
4130 DBUG_RETURN(HA_ADMIN_INVALID);
4131 }
4132 for (uint i = m_part_info->get_first_used_partition();
4133 i < m_tot_parts;
4134 i = m_part_info->get_next_used_partition(i)) {
4135
4136 /* TODO: Implement and use ha_innobase::repair()! */
4137 error = Partition_helper::check_misplaced_rows(i, true);
4138 if (error != 0) {
4139 print_admin_msg(
4140 thd,
4141 256,
4142 "error",
4143 table_share->db.str,
4144 table->alias,
4145 "repair",
4146 m_is_sub_partitioned ?
4147 "Subpartition %s returned error"
4148 : "Partition %s returned error",
4149 m_part_share->get_partition_name(i));
4150 break;
4151 }
4152 }
4153
4154 DBUG_RETURN(error);
4155 }
4156
4157 /** Check if possible to switch engine (no foreign keys).
4158 Checks if ALTER TABLE may change the storage engine of the table.
4159 Changing storage engines is not allowed for tables for which there
4160 are foreign key constraints (parent or child tables).
4161 @return true if can switch engines. */
4162 bool
can_switch_engines()4163 ha_innopart::can_switch_engines()
4164 {
4165 bool can_switch;
4166
4167 DBUG_ENTER("ha_innopart::can_switch_engines");
4168 can_switch = ha_innobase::can_switch_engines();
4169 ut_ad(can_switch);
4170
4171 DBUG_RETURN(can_switch);
4172 }
4173
4174 /** Checks if a table is referenced by a foreign key.
4175 The MySQL manual states that a REPLACE is either equivalent to an INSERT,
4176 or DELETE(s) + INSERT. Only a delete is then allowed internally to resolve
4177 a duplicate key conflict in REPLACE, not an update.
4178 @return > 0 if referenced by a FOREIGN KEY. */
4179 uint
referenced_by_foreign_key()4180 ha_innopart::referenced_by_foreign_key()
4181 {
4182 if (dict_table_is_referenced_by_foreign_key(m_prebuilt->table)) {
4183
4184 #ifndef HA_INNOPART_SUPPORTS_FOREIGN_KEYS
4185 ut_ad(0);
4186 #endif /* HA_INNOPART_SUPPORTS_FOREIGN_KEYS */
4187 return(1);
4188 }
4189
4190 return(0);
4191 }
4192
4193 /** Start statement.
4194 MySQL calls this function at the start of each SQL statement inside LOCK
4195 TABLES. Inside LOCK TABLES the ::external_lock method does not work to
4196 mark SQL statement borders. Note also a special case: if a temporary table
4197 is created inside LOCK TABLES, MySQL has not called external_lock() at all
4198 on that table.
4199 MySQL-5.0 also calls this before each statement in an execution of a stored
4200 procedure. To make the execution more deterministic for binlogging, MySQL-5.0
4201 locks all tables involved in a stored procedure with full explicit table
4202 locks (thd_in_lock_tables(thd) holds in store_lock()) before executing the
4203 procedure.
4204 @param[in] thd Handle to the user thread.
4205 @param[in] lock_type Lock type.
4206 @return 0 or error code. */
4207 int
start_stmt(THD * thd,thr_lock_type lock_type)4208 ha_innopart::start_stmt(
4209 THD* thd,
4210 thr_lock_type lock_type)
4211 {
4212 int error = 0;
4213
4214 if (m_part_info->get_first_used_partition() == MY_BIT_NONE) {
4215 /* All partitions pruned away, do nothing! */
4216 return(error);
4217 }
4218
4219 error = ha_innobase::start_stmt(thd, lock_type);
4220 if (m_prebuilt->sql_stat_start) {
4221 memset(m_sql_stat_start_parts, 0xff,
4222 UT_BITS_IN_BYTES(m_tot_parts));
4223 } else {
4224 memset(m_sql_stat_start_parts, 0,
4225 UT_BITS_IN_BYTES(m_tot_parts));
4226 }
4227 return(error);
4228 }
4229
4230 /** Function to store lock for all partitions in native partitioned table. Also
4231 look at ha_innobase::store_lock for more details.
4232 @param[in] thd user thread handle
4233 @param[in] to pointer to the current element in an array of
4234 pointers to lock structs
4235 @param[in] lock_type lock type to store in 'lock'; this may also be
4236 TL_IGNORE
4237 @retval to pointer to the current element in the 'to' array */
4238 THR_LOCK_DATA**
store_lock(THD * thd,THR_LOCK_DATA ** to,thr_lock_type lock_type)4239 ha_innopart::store_lock(
4240 THD* thd,
4241 THR_LOCK_DATA** to,
4242 thr_lock_type lock_type)
4243 {
4244 trx_t* trx = m_prebuilt->trx;
4245 const uint sql_command = thd_sql_command(thd);
4246
4247 ha_innobase::store_lock(thd, to, lock_type);
4248
4249 if (sql_command == SQLCOM_FLUSH
4250 && lock_type == TL_READ_NO_INSERT) {
4251 for (uint i = 1; i < m_tot_parts; i++) {
4252 dict_table_t* table = m_part_share->get_table_part(i);
4253
4254 dberr_t err = row_quiesce_set_state(
4255 table, QUIESCE_START, trx);
4256 ut_a(err == DB_SUCCESS || err == DB_UNSUPPORTED);
4257 }
4258 }
4259
4260 return to;
4261 }
4262
4263 /** Lock/prepare to lock table.
4264 As MySQL will execute an external lock for every new table it uses when it
4265 starts to process an SQL statement (an exception is when MySQL calls
4266 start_stmt for the handle) we can use this function to store the pointer to
4267 the THD in the handle. We will also use this function to communicate
4268 to InnoDB that a new SQL statement has started and that we must store a
4269 savepoint to our transaction handle, so that we are able to roll back
4270 the SQL statement in case of an error.
4271 @param[in] thd Handle to the user thread.
4272 @param[in] lock_type Lock type.
4273 @return 0 or error number. */
4274 int
external_lock(THD * thd,int lock_type)4275 ha_innopart::external_lock(
4276 THD* thd,
4277 int lock_type)
4278 {
4279 int error = 0;
4280
4281 if (m_part_info->get_first_used_partition() == MY_BIT_NONE
4282 && !(m_mysql_has_locked
4283 && lock_type == F_UNLCK)) {
4284
4285 /* All partitions pruned away, do nothing! */
4286 ut_ad(!m_mysql_has_locked);
4287 return(error);
4288 }
4289 ut_ad(m_mysql_has_locked || lock_type != F_UNLCK);
4290
4291 m_prebuilt->table = m_part_share->get_table_part(0);
4292 error = ha_innobase::external_lock(thd, lock_type);
4293
4294 for (uint i = 0; i < m_tot_parts; i++) {
4295 dict_table_t* table = m_part_share->get_table_part(i);
4296
4297 switch (table->quiesce) {
4298 case QUIESCE_START:
4299 /* Check for FLUSH TABLE t WITH READ LOCK */
4300 if (!srv_read_only_mode
4301 && thd_sql_command(thd) == SQLCOM_FLUSH
4302 && lock_type == F_RDLCK) {
4303
4304 ut_ad(table->quiesce == QUIESCE_START);
4305
4306 if (dict_table_is_discarded(table)) {
4307 ib_senderrf(m_prebuilt->trx->mysql_thd,
4308 IB_LOG_LEVEL_ERROR,
4309 ER_TABLESPACE_DISCARDED,
4310 table->name.m_name);
4311
4312 return (HA_ERR_NO_SUCH_TABLE);
4313 }
4314
4315 row_quiesce_table_start(table,
4316 m_prebuilt->trx);
4317
4318 /* Use the transaction instance to track
4319 UNLOCK TABLES. It can be done via START
4320 TRANSACTION; too implicitly. */
4321
4322 ++m_prebuilt->trx->flush_tables;
4323 }
4324 break;
4325
4326 case QUIESCE_COMPLETE:
4327 /* Check for UNLOCK TABLES; implicit or explicit
4328 or trx interruption. */
4329 if (m_prebuilt->trx->flush_tables > 0
4330 && (lock_type == F_UNLCK
4331 || trx_is_interrupted(m_prebuilt->trx))) {
4332
4333 ut_ad(table->quiesce == QUIESCE_COMPLETE);
4334 row_quiesce_table_complete(table,
4335 m_prebuilt->trx);
4336
4337 ut_a(m_prebuilt->trx->flush_tables > 0);
4338 --m_prebuilt->trx->flush_tables;
4339 }
4340 break;
4341
4342 case QUIESCE_NONE:
4343 break;
4344
4345 default:
4346 ut_ad(0);
4347 }
4348 }
4349
4350 ut_ad(!m_auto_increment_lock);
4351 ut_ad(!m_auto_increment_safe_stmt_log_lock);
4352
4353 if (m_prebuilt->sql_stat_start) {
4354 memset(m_sql_stat_start_parts, 0xff,
4355 UT_BITS_IN_BYTES(m_tot_parts));
4356 } else {
4357 memset(m_sql_stat_start_parts, 0,
4358 UT_BITS_IN_BYTES(m_tot_parts));
4359 }
4360 return(error);
4361 }
4362
4363 /** Get the current auto_increment value.
4364 @param[in] offset Table auto-inc offset.
4365 @param[in] increment Table auto-inc increment.
4366 @param[in] nb_desired_values Number of required values.
4367 @param[out] first_value The auto increment value.
4368 @param[out] nb_reserved_values Number of reserved values.
4369 @return Auto increment value, or ~0 on failure. */
4370 void
get_auto_increment(ulonglong offset,ulonglong increment,ulonglong nb_desired_values,ulonglong * first_value,ulonglong * nb_reserved_values)4371 ha_innopart::get_auto_increment(
4372 ulonglong offset,
4373 ulonglong increment,
4374 ulonglong nb_desired_values,
4375 ulonglong* first_value,
4376 ulonglong* nb_reserved_values)
4377 {
4378 DBUG_ENTER("ha_innopart::get_auto_increment");
4379 if (table_share->next_number_keypart != 0) {
4380 /* Only first key part allowed as autoinc for InnoDB tables! */
4381 ut_ad(0);
4382 *first_value = ULLONG_MAX;
4383 DBUG_VOID_RETURN;
4384 }
4385 get_auto_increment_first_field(
4386 increment,
4387 nb_desired_values,
4388 first_value,
4389 nb_reserved_values);
4390 DBUG_VOID_RETURN;
4391 }
4392
4393 /** Get partition row type
4394 @param[in] Id of partition for which row type to be retrieved
4395 @return Partition row type */
get_partition_row_type(uint part_id)4396 enum row_type ha_innopart::get_partition_row_type(
4397 uint part_id)
4398 {
4399 set_partition(part_id);
4400 return get_row_type();
4401 }
4402
4403 /** Compares two 'refs'.
4404 A 'ref' is the (internal) primary key value of the row.
4405 If there is no explicitly declared non-null unique key or a primary key, then
4406 InnoDB internally uses the row id as the primary key.
4407 It will use the partition id as secondary compare.
4408 @param[in] ref1 An (internal) primary key value in the MySQL key value
4409 format.
4410 @param[in] ref2 Reference to compare with (same type as ref1).
4411 @return < 0 if ref1 < ref2, 0 if equal, else > 0. */
4412 int
cmp_ref(const uchar * ref1,const uchar * ref2)4413 ha_innopart::cmp_ref(
4414 const uchar* ref1,
4415 const uchar* ref2)
4416 {
4417 int cmp;
4418
4419 cmp = ha_innobase::cmp_ref(ref1 + PARTITION_BYTES_IN_POS,
4420 ref2 + PARTITION_BYTES_IN_POS);
4421
4422 if (cmp != 0) {
4423 return(cmp);
4424 }
4425
4426 cmp = static_cast<int>(uint2korr(ref1))
4427 - static_cast<int>(uint2korr(ref2));
4428
4429 return(cmp);
4430 }
4431
4432 /** Prepare for creating new partitions during ALTER TABLE ... PARTITION.
4433 @param[in] num_partitions Number of new partitions to be created.
4434 @param[in] only_create True if only creating the partition
4435 (no open/lock is needed).
4436 @return 0 for success else error code. */
4437 int
prepare_for_new_partitions(uint num_partitions,bool only_create)4438 ha_innopart::prepare_for_new_partitions(
4439 uint num_partitions,
4440 bool only_create)
4441 {
4442 m_new_partitions = UT_NEW(Altered_partitions(num_partitions,
4443 only_create),
4444 mem_key_partitioning);
4445 if (m_new_partitions == NULL) {
4446 return(HA_ERR_OUT_OF_MEM);
4447 }
4448 if (m_new_partitions->initialize()) {
4449 UT_DELETE(m_new_partitions);
4450 m_new_partitions = NULL;
4451 return(HA_ERR_OUT_OF_MEM);
4452 }
4453 return(0);
4454 }
4455
4456 /** Create a new partition to be filled during ALTER TABLE ... PARTITION.
4457 @param[in] table Table to create the partition in.
4458 @param[in] create_info Table/partition specific create info.
4459 @param[in] part_name Partition name.
4460 @param[in] new_part_id Partition id in new table.
4461 @param[in] part_elem Partition element.
4462 @return 0 for success else error code. */
4463 int
create_new_partition(TABLE * table,HA_CREATE_INFO * create_info,const char * part_name,uint new_part_id,partition_element * part_elem)4464 ha_innopart::create_new_partition(
4465 TABLE* table,
4466 HA_CREATE_INFO* create_info,
4467 const char* part_name,
4468 uint new_part_id,
4469 partition_element* part_elem)
4470 {
4471 int error;
4472 char norm_name[FN_REFLEN];
4473 const char* tablespace_name_backup = create_info->tablespace;
4474 const char* data_file_name_backup = create_info->data_file_name;
4475 DBUG_ENTER("ha_innopart::create_new_partition");
4476 /* Delete by ddl_log on failure. */
4477 normalize_table_name(norm_name, part_name);
4478 set_create_info_dir(part_elem, create_info);
4479
4480 /* The below check is the same as for CREATE TABLE, but since we are
4481 doing an alter here it will not trigger the check in
4482 create_option_tablespace_is_valid(). */
4483 if (tablespace_is_shared_space(create_info)
4484 && create_info->data_file_name != NULL
4485 && create_info->data_file_name[0] != '\0') {
4486 my_printf_error(ER_ILLEGAL_HA_CREATE_OPTION,
4487 "InnoDB: DATA DIRECTORY cannot be used"
4488 " with a TABLESPACE assignment.", MYF(0));
4489 DBUG_RETURN(HA_WRONG_CREATE_OPTION);
4490 }
4491
4492 if (tablespace_is_shared_space(create_info)) {
4493 push_deprecated_warn_no_replacement(
4494 ha_thd(), PARTITION_IN_SHARED_TABLESPACE_WARNING);
4495 }
4496
4497 error = ha_innobase::create(norm_name, table, create_info);
4498 create_info->tablespace = tablespace_name_backup;
4499 create_info->data_file_name = data_file_name_backup;
4500 if (error == HA_ERR_FOUND_DUPP_KEY) {
4501 DBUG_RETURN(HA_ERR_TABLE_EXIST);
4502 }
4503 if (error != 0) {
4504 DBUG_RETURN(error);
4505 }
4506 if (!m_new_partitions->only_create())
4507 {
4508 dict_table_t* part;
4509 part = dict_table_open_on_name(norm_name,
4510 false,
4511 true,
4512 DICT_ERR_IGNORE_NONE);
4513 if (part == NULL) {
4514 DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
4515 }
4516 m_new_partitions->set_part(new_part_id, part);
4517 }
4518 DBUG_RETURN(0);
4519 }
4520
4521 /** Close and finalize new partitions. */
4522 void
close_new_partitions()4523 ha_innopart::close_new_partitions()
4524 {
4525 if (m_new_partitions != NULL) {
4526 UT_DELETE(m_new_partitions);
4527 m_new_partitions = NULL;
4528 }
4529 }
4530
4531 /** write row to new partition.
4532 @param[in] new_part New partition to write to.
4533 @return 0 for success else error code. */
4534 int
write_row_in_new_part(uint new_part)4535 ha_innopart::write_row_in_new_part(
4536 uint new_part)
4537 {
4538 int result;
4539 DBUG_ENTER("ha_innopart::write_row_in_new_part");
4540
4541 m_last_part = new_part;
4542 if (m_new_partitions->part(new_part) == NULL) {
4543 /* Altered partition contains misplaced row. */
4544 m_err_rec = table->record[0];
4545 DBUG_RETURN(HA_ERR_ROW_IN_WRONG_PARTITION);
4546 }
4547 m_new_partitions->get_prebuilt(m_prebuilt, new_part);
4548 result = ha_innobase::write_row(table->record[0]);
4549 m_new_partitions->set_from_prebuilt(m_prebuilt, new_part);
4550 DBUG_RETURN(result);
4551 }
4552
4553 /** Allocate the array to hold blob heaps for all partitions */
4554 mem_heap_t**
alloc_blob_heap_array()4555 ha_innopart::alloc_blob_heap_array()
4556 {
4557 DBUG_ENTER("ha_innopart::alloc_blob_heap_array");
4558
4559 const ulint len = sizeof(mem_heap_t*) * m_tot_parts;
4560 m_blob_heap_parts = static_cast<mem_heap_t**>(
4561 ut_zalloc(len, mem_key_partitioning));
4562 if (m_blob_heap_parts == NULL) {
4563 DBUG_RETURN(NULL);
4564 }
4565
4566 DBUG_RETURN(m_blob_heap_parts);
4567 }
4568
4569 /** Free the array that holds blob heaps for all partitions */
4570 void
free_blob_heap_array()4571 ha_innopart::free_blob_heap_array()
4572 {
4573 DBUG_ENTER("ha_innopart::free_blob_heap_array");
4574
4575 if (m_blob_heap_parts != NULL) {
4576 clear_blob_heaps();
4577 ut_free(m_blob_heap_parts);
4578 m_blob_heap_parts = NULL;
4579 }
4580
4581 DBUG_VOID_RETURN;
4582 }
4583
4584 void
clear_blob_heaps()4585 ha_innopart::clear_blob_heaps()
4586 {
4587 DBUG_ENTER("ha_innopart::clear_blob_heaps");
4588
4589 if (m_blob_heap_parts == NULL) {
4590 DBUG_VOID_RETURN;
4591 }
4592
4593 for (uint i = 0; i < m_tot_parts; i++) {
4594 if (m_blob_heap_parts[i] != NULL) {
4595 DBUG_PRINT("ha_innopart", ("freeing blob_heap: %p",
4596 m_blob_heap_parts[i]));
4597 mem_heap_free(m_blob_heap_parts[i]);
4598 m_blob_heap_parts[i] = NULL;
4599 }
4600 }
4601
4602 /* Reset blob_heap in m_prebuilt after freeing all heaps. It is set in
4603 ha_innopart::set_partition to the blob heap of current partition. */
4604 m_prebuilt->blob_heap = NULL;
4605
4606 DBUG_VOID_RETURN;
4607 }
4608
4609 /** Reset state of file to after 'open'. This function is called
4610 after every statement for all tables used by that statement. */
4611 int
reset()4612 ha_innopart::reset()
4613 {
4614 DBUG_ENTER("ha_innopart::reset");
4615
4616 clear_blob_heaps();
4617
4618 DBUG_RETURN(ha_innobase::reset());
4619 }
4620
4621 /**
4622 Read row using position using given record to find.
4623
4624 This works as position()+rnd_pos() functions, but does some
4625 extra work,calculating m_last_part - the partition to where
4626 the 'record' should go. Only useful when position is based
4627 on primary key (HA_PRIMARY_KEY_REQUIRED_FOR_POSITION).
4628
4629 @param[in] record Current record in MySQL Row Format.
4630 @return 0 for success else error code. */
4631 int
rnd_pos_by_record(uchar * record)4632 ha_innopart::rnd_pos_by_record(uchar* record)
4633 {
4634 int error;
4635 DBUG_ENTER("ha_innopart::rnd_pos_by_record");
4636 assert(ha_table_flags() &
4637 HA_PRIMARY_KEY_REQUIRED_FOR_POSITION);
4638 /* TODO: Support HA_READ_BEFORE_WRITE_REMOVAL */
4639 /* Set m_last_part correctly. */
4640 if (unlikely(get_part_for_delete(record,
4641 m_table->record[0],
4642 m_part_info,
4643 &m_last_part))) {
4644 DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
4645 }
4646
4647 /* Init only the partition in which row resides */
4648 error = rnd_init_in_part(m_last_part, false);
4649 if (error != 0) {
4650 goto err;
4651 }
4652
4653 position(record);
4654 error = handler::ha_rnd_pos(record, ref);
4655 err:
4656 rnd_end_in_part(m_last_part,FALSE);
4657 DBUG_RETURN(error);
4658 }
4659
4660 /****************************************************************************
4661 * DS-MRR implementation
4662 ***************************************************************************/
4663
4664 /* TODO: move the default implementations into the base handler class! */
4665 /* TODO: See if it could be optimized for partitioned tables? */
4666 /* Use default ha_innobase implementation for now... */
4667