1 /*
2    Copyright (c) 2005, 2019, Oracle and/or its affiliates. All rights reserved.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License, version 2.0,
6    as published by the Free Software Foundation.
7 
8    This program is also distributed with certain software (including
9    but not limited to OpenSSL) that is licensed under separate terms,
10    as designated in a particular file or component or in included license
11    documentation.  The authors of MySQL hereby grant you an additional
12    permission to link the program and your derivative works with the
13    separately licensed software that they have included with MySQL.
14 
15    This program is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License, version 2.0, for more details.
19 
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
23 */
24 
25 /*
26   This handler was developed by Mikael Ronstrom for version 5.1 of MySQL.
27   It is an abstraction layer on top of other handlers such as MyISAM,
28   InnoDB, Federated, Berkeley DB and so forth. Partitioned tables can also
29   be handled by a storage engine. The current example of this is NDB
  Cluster that has internally handled partitioning. This has benefits in
31   that many loops needed in the partition handler can be avoided.
32 
33   Partitioning has an inherent feature which in some cases is positive and
34   in some cases is negative. It splits the data into chunks. This makes
35   the data more manageable, queries can easily be parallelised towards the
36   parts and indexes are split such that there are less levels in the
37   index trees. The inherent disadvantage is that to use a split index
38   one has to scan all index parts which is ok for large queries but for
39   small queries it can be a disadvantage.
40 
41   Partitioning lays the foundation for more manageable databases that are
42   extremely large. It does also lay the foundation for more parallelism
43   in the execution of queries. This functionality will grow with later
44   versions of MySQL.
45 
  You can enable it in your build by doing the following during your build
47   process:
48   ./configure --with-partition
49 
  The partition handler is set up to use table locks. It implements a partition "SHARE"
51   that is inserted into a hash by table name. You can use this to store
52   information of state that any partition handler object will be able to see
53   if it is using the same table.
54 
55   Please read the object definition in ha_partition.h before reading the rest
  of this file.
57 */
58 
59 #include "sql_priv.h"
60 #include "sql_parse.h"                          // append_file_to_dir
61 #include "binlog.h"                             // mysql_bin_log
62 
63 #ifdef WITH_PARTITION_STORAGE_ENGINE
64 #include "ha_partition.h"
65 #include "sql_table.h"                        // tablename_to_filename
66 #include "key.h"
67 #include "sql_plugin.h"
68 #include "sql_partition.h"
69 #include "sql_show.h"                        // append_identifier
70 #include "sql_admin.h"                       // SQL_ADMIN_MSG_TEXT_SIZE
71 
72 #include "debug_sync.h"
73 
74 using std::min;
75 using std::max;
76 
77 
/* First 4 bytes in the .par file is the number of 32-bit words in the file */
#define PAR_WORD_SIZE 4
/* offset to the .par file checksum */
#define PAR_CHECKSUM_OFFSET 4
/* offset to the total number of partitions */
#define PAR_NUM_PARTS_OFFSET 8
/* offset to the engines array */
#define PAR_ENGINES_OFFSET 12
/* Table flags the partition handler always advertises, regardless of the
   underlying engines */
#define PARTITION_ENABLED_TABLE_FLAGS (HA_FILE_BASED | \
                                       HA_REC_NOT_IN_SEQ | \
                                       HA_CAN_REPAIR)
/* Table flags the partition handler never advertises, even when every
   underlying engine supports them */
#define PARTITION_DISABLED_TABLE_FLAGS (HA_CAN_GEOMETRY | \
                                        HA_CAN_FULLTEXT | \
                                        HA_DUPLICATE_POS | \
                                        HA_CAN_SQL_HANDLER | \
                                        HA_CAN_INSERT_DELAYED | \
                                        HA_READ_BEFORE_WRITE_REMOVAL)
/* File extension of the partition handler's metadata file */
static const char *ha_par_ext= ".par";
96 
97 /****************************************************************************
98                 MODULE create/delete handler object
99 ****************************************************************************/
100 
101 static handler *partition_create_handler(handlerton *hton,
102                                          TABLE_SHARE *share,
103                                          MEM_ROOT *mem_root);
104 static uint partition_flags();
105 static uint alter_table_flags(uint flags);
106 
107 
108 /****************************************************************************
109     Check whether the partition column order changes after alter
110 ****************************************************************************/
111 static bool check_partition_column_order(List<Create_field> *create_list,
112                                          Field** field_arary);
113 
114 #ifdef HAVE_PSI_INTERFACE
115 PSI_mutex_key key_partition_auto_inc_mutex;
116 
117 static PSI_mutex_info all_partition_mutexes[]=
118 {
119   { &key_partition_auto_inc_mutex, "Partition_share::auto_inc_mutex", 0}
120 };
121 
init_partition_psi_keys(void)122 static void init_partition_psi_keys(void)
123 {
124   const char* category= "partition";
125   int count;
126 
127   count= array_elements(all_partition_mutexes);
128   mysql_mutex_register(category, all_partition_mutexes, count);
129 }
130 #endif /* HAVE_PSI_INTERFACE */
131 
/**
  Initialize the partition handlerton at plugin load time.

  @param p  The handlerton to fill in (passed as void* by the plugin
            framework).

  @return Always 0 (success).
*/
static int partition_initialize(void *p)
{

  handlerton *partition_hton;
  partition_hton= (handlerton *)p;

  partition_hton->state= SHOW_OPTION_YES;
  partition_hton->db_type= DB_TYPE_PARTITION_DB;
  partition_hton->create= partition_create_handler;
  partition_hton->partition_flags= partition_flags;
  partition_hton->alter_table_flags= alter_table_flags;
  /*
    Hidden and not user-selectable: users cannot name this engine in
    CREATE TABLE; it is only instantiated internally for partitioned
    tables. Temporary tables are not supported.
  */
  partition_hton->flags= HTON_NOT_USER_SELECTABLE |
                         HTON_HIDDEN |
                         HTON_TEMPORARY_NOT_SUPPORTED;
#ifdef HAVE_PSI_INTERFACE
  init_partition_psi_keys();
#endif
  return 0;
}
151 
152 
153 /**
154   Initialize and allocate space for partitions shares.
155 
156   @param num_parts  Number of partitions to allocate storage for.
157 
158   @return Operation status.
159     @retval true  Failure (out of memory).
160     @retval false Success.
161 */
162 
bool Partition_share::init(uint num_parts)
{
  DBUG_ENTER("Partition_share::init");
  mysql_mutex_init(key_partition_auto_inc_mutex,
                   &auto_inc_mutex,
                   MY_MUTEX_INIT_FAST);
  auto_inc_initialized= false;
  partition_name_hash_initialized= false;
  next_auto_inc_val= 0;
  /* One Parts_share_refs entry per partition. */
  partitions_share_refs= new Parts_share_refs;
  if (!partitions_share_refs)
    DBUG_RETURN(true);
  if (partitions_share_refs->init(num_parts))
  {
    /*
      NOTE(review): auto_inc_mutex stays initialized on both failure
      paths; presumably the Partition_share destructor destroys it --
      confirm to rule out a mutex leak.
    */
    delete partitions_share_refs;
    DBUG_RETURN(true);
  }
  DBUG_RETURN(false);
}
182 
183 
184 /*
185   Create new partition handler
186 
187   SYNOPSIS
188     partition_create_handler()
189     table                       Table object
190 
191   RETURN VALUE
192     New partition object
193 */
194 
partition_create_handler(handlerton * hton,TABLE_SHARE * share,MEM_ROOT * mem_root)195 static handler *partition_create_handler(handlerton *hton,
196                                          TABLE_SHARE *share,
197                                          MEM_ROOT *mem_root)
198 {
199   ha_partition *file= new (mem_root) ha_partition(hton, share);
200   if (file && file->initialize_partition(mem_root))
201   {
202     delete file;
203     file= 0;
204   }
205   return file;
206 }
207 
208 /*
209   HA_CAN_PARTITION:
210   Used by storage engines that can handle partitioning without this
211   partition handler
212   (Partition, NDB)
213 
214   HA_CAN_UPDATE_PARTITION_KEY:
215   Set if the handler can update fields that are part of the partition
216   function.
217 
218   HA_CAN_PARTITION_UNIQUE:
219   Set if the handler can handle unique indexes where the fields of the
220   unique key are not part of the fields of the partition function. Thus
221   a unique key can be set on all fields.
222 
223   HA_USE_AUTO_PARTITION
224   Set if the handler sets all tables to be partitioned by default.
225 */
226 
/**
  Partition capabilities of the partition handler itself: it can handle
  partitioning, but none of the optional capabilities described in the
  comment block above.
*/
static uint partition_flags()
{
  return HA_CAN_PARTITION;
}
231 
/**
  ALTER TABLE capabilities advertised for partitioned tables.

  @param flags  Unused.

  @return Bitmap: partition functions are supported and partitions can
          be changed without copying the whole table.
*/
static uint alter_table_flags(uint flags MY_ATTRIBUTE((unused)))
{
  return (HA_PARTITION_FUNCTION_SUPPORTED |
          HA_FAST_CHANGE_PARTITION);
}
237 
check_partition_column_order(List<Create_field> * create_list,Field ** field_arary)238 static bool check_partition_column_order(List<Create_field> *create_list,
239                                          Field** field_arary)
240 {
241 
242   Field **f_ptr;
243   List_iterator_fast<Create_field> new_field_it;
244   Create_field *new_field= NULL;
245   new_field_it.init(*create_list);
246 
247   for (f_ptr= field_arary ; *f_ptr; f_ptr++)
248   {
249     while ((new_field= new_field_it++))
250     {
251       if (new_field->field == *f_ptr)
252         break;
253     }
254     if (!new_field)
255       break;
256   }
257 
258   if (!new_field)
259   {
260     /* Not same order, INPLACE cannot be allowed!*/
261     return false;
262   }
263   return true;
264 }
265 
266 const uint32 ha_partition::NO_CURRENT_PART_ID= NOT_A_PARTITION_ID;
267 
268 /*
269   Constructor method
270 
271   SYNOPSIS
272     ha_partition()
273     table                       Table object
274 
275   RETURN VALUE
276     NONE
277 */
278 
/**
  Normal constructor: used when opening an existing partitioned table.
  All real setup happens later in initialize_partition()/open().
*/
ha_partition::ha_partition(handlerton *hton, TABLE_SHARE *share)
  :handler(hton, share)
{
  DBUG_ENTER("ha_partition::ha_partition(table)");
  init_handler_variables();
  DBUG_VOID_RETURN;
}
286 
287 
288 /*
289   Constructor method
290 
291   SYNOPSIS
292     ha_partition()
293     part_info                       Partition info
294 
295   RETURN VALUE
296     NONE
297 */
298 
/**
  Constructor used during CREATE TABLE, when partition_info is already
  known but no TABLE_SHARE exists yet; marks the handler as being in
  "create" mode (m_create_handler).
*/
ha_partition::ha_partition(handlerton *hton, partition_info *part_info)
  :handler(hton, NULL)
{
  DBUG_ENTER("ha_partition::ha_partition(part_info)");
  DBUG_ASSERT(part_info);
  init_handler_variables();
  m_part_info= part_info;
  m_create_handler= TRUE;
  m_is_sub_partitioned= m_part_info->is_sub_partitioned();
  DBUG_VOID_RETURN;
}
310 
311 /**
312   ha_partition constructor method used by ha_partition::clone()
313 
314   @param hton               Handlerton (partition_hton)
315   @param share              Table share object
316   @param part_info_arg      partition_info to use
317   @param clone_arg          ha_partition to clone
  @param clone_mem_root_arg  MEM_ROOT to use
319 
320   @return New partition handler
321 */
322 
ha_partition::ha_partition(handlerton *hton, TABLE_SHARE *share,
                           partition_info *part_info_arg,
                           ha_partition *clone_arg,
                           MEM_ROOT *clone_mem_root_arg)
  :handler(hton, share)
{
  DBUG_ENTER("ha_partition::ha_partition(clone)");
  init_handler_variables();
  m_part_info= part_info_arg;
  m_create_handler= TRUE;
  m_is_sub_partitioned= m_part_info->is_sub_partitioned();
  /* Remember the source handler so open() can clone its partition
     handlers instead of opening fresh ones. */
  m_is_clone_of= clone_arg;
  m_clone_mem_root= clone_mem_root_arg;
  /* Share bookkeeping state with the handler being cloned. */
  part_share= clone_arg->part_share;
  m_tot_parts= clone_arg->m_tot_parts;
  m_pkey_is_clustered= clone_arg->primary_key_is_clustered();
  DBUG_VOID_RETURN;
}
341 
342 /*
343   Initialize handler object
344 
345   SYNOPSIS
346     init_handler_variables()
347 
348   RETURN VALUE
349     NONE
350 */
351 
/**
  Reset every member to its "not yet opened/initialized" default.
  Called from all constructors before any real setup is done.
*/
void ha_partition::init_handler_variables()
{
  active_index= MAX_KEY;
  m_mode= 0;
  m_open_test_lock= 0;
  /* .par file buffer and derived name/engine information */
  m_file_buffer= NULL;
  m_name_buffer_ptr= NULL;
  m_engine_array= NULL;
  /* Per-partition handler arrays (normal, reorganised, new, added) */
  m_file= NULL;
  m_file_tot_parts= 0;
  m_reorged_file= NULL;
  m_new_file= NULL;
  m_reorged_parts= 0;
  m_added_file= NULL;
  m_tot_parts= 0;
  m_pkey_is_clustered= 0;
  m_part_spec.start_part= NO_CURRENT_PART_ID;
  /* NOTE(review): 2 appears to mean "no scan active" -- confirm against
     the users of m_scan_value. */
  m_scan_value= 2;
  m_ref_length= 0;
  m_part_spec.end_part= NO_CURRENT_PART_ID;
  m_index_scan_type= partition_no_index_scan;
  m_start_key.key= NULL;
  m_start_key.length= 0;
  m_myisam= FALSE;
  m_innodb= FALSE;
  /* Extra cache state forwarded from ::extra() calls */
  m_extra_cache= FALSE;
  m_extra_cache_size= 0;
  m_extra_prepare_for_update= FALSE;
  m_extra_cache_part_id= NO_CURRENT_PART_ID;
  m_handler_status= handler_not_initialized;
  m_low_byte_first= 1;
  m_part_field_array= NULL;
  m_ordered_rec_buffer= NULL;
  m_top_entry= NO_CURRENT_PART_ID;
  m_rec_length= 0;
  m_last_part= 0;
  m_rec0= 0;
  m_err_rec= NULL;
  m_curr_key_info[0]= NULL;
  m_curr_key_info[1]= NULL;
  m_part_func_monotonicity_info= NON_MONOTONIC;
  auto_increment_lock= FALSE;
  auto_increment_safe_stmt_log_lock= FALSE;
  /*
    this allows blackhole to work properly
  */
  m_num_locks= 0;
  m_part_info= NULL;
  m_create_handler= FALSE;
  m_is_sub_partitioned= 0;
  m_is_clone_of= NULL;
  m_clone_mem_root= NULL;
  part_share= NULL;
  m_new_partitions_share_refs.empty();
  m_part_ids_sorted_by_num_of_records= NULL;
  m_sec_sort_by_rowid= false;

#ifdef DONT_HAVE_TO_BE_INITALIZED
  m_start_key.flag= 0;
  m_ordered= TRUE;
#endif
}
414 
415 
/**
  @return Table type string of the first partition's handler; valid
          since all partitions must use the same storage engine.
*/
const char *ha_partition::table_type() const
{
  // we can do this since we only support a single engine type
  return m_file[0]->table_type();
}
421 
422 
423 /*
424   Destructor method
425 
426   SYNOPSIS
427     ~ha_partition()
428 
429   RETURN VALUE
430     NONE
431 */
432 
ha_partition::~ha_partition()
{
  DBUG_ENTER("ha_partition::~ha_partition()");
  if (m_new_partitions_share_refs.elements)
    m_new_partitions_share_refs.delete_elements();
  /* Delete each underlying partition handler. */
  if (m_file != NULL)
  {
    uint i;
    for (i= 0; i < m_tot_parts; i++)
      delete m_file[i];
  }
  destroy_record_priority_queue();
  /* my_free(NULL) is a no-op, so no guard is needed. */
  my_free(m_part_ids_sorted_by_num_of_records);

  clear_handler_file();
  DBUG_VOID_RETURN;
}
450 
451 
452 /*
453   Initialize partition handler object
454 
455   SYNOPSIS
456     initialize_partition()
457     mem_root			Allocate memory through this
458 
459   RETURN VALUE
460     1                         Error
461     0                         Success
462 
463   DESCRIPTION
464 
465   The partition handler is only a layer on top of other engines. Thus it
466   can't really perform anything without the underlying handlers. Thus we
467   add this method as part of the allocation of a handler object.
468 
469   1) Allocation of underlying handlers
470      If we have access to the partition info we will allocate one handler
471      instance for each partition.
472   2) Allocation without partition info
473      The cases where we don't have access to this information is when called
474      in preparation for delete_table and rename_table and in that case we
475      only need to set HA_FILE_BASED. In that case we will use the .par file
476      that contains information about the partitions and their engines and
477      the names of each partition.
478   3) Table flags initialisation
479      We need also to set table flags for the partition handler. This is not
480      static since it depends on what storage engines are used as underlying
481      handlers.
482      The table flags is set in this routine to simulate the behaviour of a
483      normal storage engine
484      The flag HA_FILE_BASED will be set independent of the underlying handlers
485   4) Index flags initialisation
486      When knowledge exists on the indexes it is also possible to initialize the
487      index flags. Again the index flags must be initialized by using the under-
488      lying handlers since this is storage engine dependent.
489      The flag HA_READ_ORDER will be reset for the time being to indicate no
490      ordered output is available from partition handler indexes. Later a merge
491      sort will be performed using the underlying handlers.
492   5) primary_key_is_clustered, has_transactions and low_byte_first is
493      calculated here.
494 
495 */
496 
bool ha_partition::initialize_partition(MEM_ROOT *mem_root)
{
  handler **file_array, *file;
  ulonglong check_table_flags;
  DBUG_ENTER("ha_partition::initialize_partition");

  if (m_create_handler)
  {
    /* CREATE TABLE path: partition info is available in memory. */
    m_tot_parts= m_part_info->get_tot_partitions();
    DBUG_ASSERT(m_tot_parts > 0);
    if (new_handlers_from_part_info(mem_root))
      DBUG_RETURN(1);
  }
  else if (!table_share || !table_share->normalized_path.str)
  {
    /*
      Called with dummy table share (delete, rename and alter table).
      Don't need to set-up anything.
    */
    DBUG_RETURN(0);
  }
  else if (get_from_handler_file(table_share->normalized_path.str,
                                 mem_root, false))
  {
    /* Normal open path: partition layout is read from the .par file. */
    my_error(ER_FAILED_READ_FROM_PAR_FILE, MYF(0));
    DBUG_RETURN(1);
  }
  /*
    We create all underlying table handlers here. We do it in this special
    method to be able to report allocation errors.

    Set up low_byte_first, primary_key_is_clustered and
    has_transactions since they are called often in all kinds of places,
    other parameters are calculated on demand.
    Verify that all partitions have the same table_flags.
  */
  check_table_flags= m_file[0]->ha_table_flags();
  m_low_byte_first= m_file[0]->low_byte_first();
  m_pkey_is_clustered= TRUE;
  file_array= m_file;
  /* m_file is a NULL-terminated array; validate each partition handler. */
  do
  {
    file= *file_array;
    if (m_low_byte_first != file->low_byte_first())
    {
      // Cannot have handlers with different endian
      my_error(ER_MIX_HANDLER_ERROR, MYF(0));
      DBUG_RETURN(1);
    }
    /* Clustered PK only if every partition's engine has one. */
    if (!file->primary_key_is_clustered())
      m_pkey_is_clustered= FALSE;
    if (check_table_flags != file->ha_table_flags())
    {
      my_error(ER_MIX_HANDLER_ERROR, MYF(0));
      DBUG_RETURN(1);
    }
  } while (*(++file_array));
  m_handler_status= handler_initialized;
  DBUG_RETURN(0);
}
557 
558 /****************************************************************************
559                 MODULE meta data changes
560 ****************************************************************************/
561 /*
562   Delete a table
563 
564   SYNOPSIS
565     delete_table()
566     name                    Full path of table name
567 
568   RETURN VALUE
569     >0                        Error
570     0                         Success
571 
572   DESCRIPTION
573     Used to delete a table. By the time delete_table() has been called all
574     opened references to this table will have been closed (and your globally
575     shared references released. The variable name will just be the name of
576     the table. You will need to remove any files you have created at this
577     point.
578 
579     If you do not implement this, the default delete_table() is called from
580     handler.cc and it will delete all files with the file extentions returned
581     by bas_ext().
582 
583     Called from handler.cc by delete_table and  ha_create_table(). Only used
584     during create if the table_flag HA_DROP_BEFORE_CREATE was specified for
585     the storage engine.
586 */
587 
int ha_partition::delete_table(const char *name)
{
  DBUG_ENTER("ha_partition::delete_table");

  /* NULL 'to' name selects delete (as opposed to rename). */
  DBUG_RETURN(del_ren_table(name, NULL));
}
594 
595 
596 /*
597   Rename a table
598 
599   SYNOPSIS
600     rename_table()
601     from                      Full path of old table name
602     to                        Full path of new table name
603 
604   RETURN VALUE
605     >0                        Error
606     0                         Success
607 
608   DESCRIPTION
609     Renames a table from one name to another from alter table call.
610 
611     If you do not implement this, the default rename_table() is called from
612     handler.cc and it will rename all files with the file extentions returned
613     by bas_ext().
614 
615     Called from sql_table.cc by mysql_rename_table().
616 */
617 
int ha_partition::rename_table(const char *from, const char *to)
{
  DBUG_ENTER("ha_partition::rename_table");

  /* Shared implementation with delete_table(); non-NULL 'to' means rename. */
  DBUG_RETURN(del_ren_table(from, to));
}
624 
625 
626 /*
627   Create the handler file (.par-file)
628 
629   SYNOPSIS
630     create_handler_files()
631     name                              Full path of table name
632     create_info                       Create info generated for CREATE TABLE
633 
634   RETURN VALUE
635     >0                        Error
636     0                         Success
637 
638   DESCRIPTION
639     create_handler_files is called to create any handler specific files
640     before opening the file with openfrm to later call ::create on the
641     file object.
642     In the partition handler this is used to store the names of partitions
643     and types of engines in the partitions.
644 */
645 
create_handler_files(const char * path,const char * old_path,int action_flag,HA_CREATE_INFO * create_info)646 int ha_partition::create_handler_files(const char *path,
647                                        const char *old_path,
648                                        int action_flag,
649                                        HA_CREATE_INFO *create_info)
650 {
651   DBUG_ENTER("ha_partition::create_handler_files()");
652 
653   /*
654     We need to update total number of parts since we might write the handler
655     file as part of a partition management command
656   */
657   if (action_flag == CHF_DELETE_FLAG ||
658       action_flag == CHF_RENAME_FLAG)
659   {
660     char name[FN_REFLEN];
661     char old_name[FN_REFLEN];
662 
663     strxmov(name, path, ha_par_ext, NullS);
664     strxmov(old_name, old_path, ha_par_ext, NullS);
665     if ((action_flag == CHF_DELETE_FLAG &&
666          mysql_file_delete(key_file_partition, name, MYF(MY_WME))) ||
667         (action_flag == CHF_RENAME_FLAG &&
668          mysql_file_rename(key_file_partition, old_name, name, MYF(MY_WME))))
669     {
670       DBUG_RETURN(TRUE);
671     }
672   }
673   else if (action_flag == CHF_CREATE_FLAG)
674   {
675     if (create_handler_file(path))
676     {
677       my_error(ER_CANT_CREATE_HANDLER_FILE, MYF(0));
678       DBUG_RETURN(1);
679     }
680   }
681   DBUG_RETURN(0);
682 }
683 
684 
685 /*
686   Create a partitioned table
687 
688   SYNOPSIS
689     create()
690     name                              Full path of table name
691     table_arg                         Table object
692     create_info                       Create info generated for CREATE TABLE
693 
694   RETURN VALUE
695     >0                        Error
696     0                         Success
697 
698   DESCRIPTION
699     create() is called to create a table. The variable name will have the name
700     of the table. When create() is called you do not need to worry about
701     opening the table. Also, the FRM file will have already been created so
702     adjusting create_info will not do you any good. You can overwrite the frm
703     file at this point if you wish to change the table definition, but there
704     are no methods currently provided for doing that.
705 
706     Called from handler.cc by ha_create_table().
707 */
708 
create(const char * name,TABLE * table_arg,HA_CREATE_INFO * create_info)709 int ha_partition::create(const char *name, TABLE *table_arg,
710 			 HA_CREATE_INFO *create_info)
711 {
712   int error= 0;
713   char name_buff[FN_REFLEN + 1], name_lc_buff[FN_REFLEN + 1];
714   char *name_buffer_ptr;
715   const char *path;
716   uint i;
717   List_iterator_fast <partition_element> part_it(m_part_info->partitions);
718   partition_element *part_elem;
719   handler **file, **abort_file;
720   DBUG_ENTER("ha_partition::create");
721 
722   DBUG_ASSERT(*fn_rext((char*)name) == '\0');
723 
724   /* Not allowed to create temporary partitioned tables */
725   if (create_info && create_info->options & HA_LEX_CREATE_TMP_TABLE)
726   {
727     my_error(ER_PARTITION_NO_TEMPORARY, MYF(0));
728     DBUG_RETURN(TRUE);
729   }
730 
731   if (get_from_handler_file(name, ha_thd()->mem_root, false))
732     DBUG_RETURN(TRUE);
733   DBUG_ASSERT(m_file_buffer);
734   DBUG_PRINT("enter", ("name: (%s)", name));
735   name_buffer_ptr= m_name_buffer_ptr;
736   file= m_file;
737   /*
738     Since ha_partition has HA_FILE_BASED, it must alter underlying table names
739     if they do not have HA_FILE_BASED and lower_case_table_names == 2.
740     See Bug#37402, for Mac OS X.
741     The appended #P#<partname>[#SP#<subpartname>] will remain in current case.
742     Using the first partitions handler, since mixing handlers is not allowed.
743   */
744   path= get_canonical_filename(*file, name, name_lc_buff);
745   for (i= 0; i < m_part_info->num_parts; i++)
746   {
747     part_elem= part_it++;
748     if (m_is_sub_partitioned)
749     {
750       uint j;
751       List_iterator_fast <partition_element> sub_it(part_elem->subpartitions);
752       for (j= 0; j < m_part_info->num_subparts; j++)
753       {
754         part_elem= sub_it++;
755         if ((error= create_partition_name(name_buff, path, name_buffer_ptr,
756                                           NORMAL_PART_NAME, FALSE)))
757           goto create_error;
758 
759         if ((error= set_up_table_before_create(table_arg, name_buff,
760                                                create_info, part_elem)) ||
761             ((error= (*file)->ha_create(name_buff, table_arg, create_info))))
762           goto create_error;
763 
764         name_buffer_ptr= strend(name_buffer_ptr) + 1;
765         file++;
766       }
767     }
768     else
769     {
770       if ((create_partition_name(name_buff, path, name_buffer_ptr,
771                                  NORMAL_PART_NAME, FALSE)))
772         goto create_error;
773 
774       if ((error= set_up_table_before_create(table_arg, name_buff,
775                                              create_info, part_elem)) ||
776           ((error= (*file)->ha_create(name_buff, table_arg, create_info))))
777         goto create_error;
778 
779       name_buffer_ptr= strend(name_buffer_ptr) + 1;
780       file++;
781     }
782   }
783   DBUG_RETURN(0);
784 
785 create_error:
786   name_buffer_ptr= m_name_buffer_ptr;
787   for (abort_file= file, file= m_file; file < abort_file; file++)
788   {
789     if (!create_partition_name(name_buff, path, name_buffer_ptr, NORMAL_PART_NAME,
790                                FALSE))
791       (void) (*file)->ha_delete_table((const char*) name_buff);
792     name_buffer_ptr= strend(name_buffer_ptr) + 1;
793   }
794   handler::delete_table(name);
795   DBUG_RETURN(error);
796 }
797 
798 
799 /*
800   Drop partitions as part of ALTER TABLE of partitions
801 
802   SYNOPSIS
803     drop_partitions()
804     path                        Complete path of db and table name
805 
806   RETURN VALUE
807     >0                          Failure
808     0                           Success
809 
810   DESCRIPTION
811     Use part_info object on handler object to deduce which partitions to
812     drop (each partition has a state attached to it)
813 */
814 
int ha_partition::drop_partitions(const char *path)
{
  List_iterator<partition_element> part_it(m_part_info->partitions);
  char part_name_buff[FN_REFLEN + 1];
  uint num_parts= m_part_info->partitions.elements;
  uint num_subparts= m_part_info->num_subparts;
  uint i= 0;
  uint name_variant;
  int  ret_error;
  int  error= 0;
  DBUG_ENTER("ha_partition::drop_partitions");

  /*
    Assert that it works without HA_FILE_BASED and lower_case_table_name = 2.
    We use m_file[0] as long as all partitions have the same storage engine.
  */
  DBUG_ASSERT(!strcmp(path, get_canonical_filename(m_file[0], path,
                                                   part_name_buff)));
  do
  {
    partition_element *part_elem= part_it++;
    if (part_elem->part_state == PART_TO_BE_DROPPED)
    {
      handler *file;
      /*
        This part is to be dropped, meaning the part or all its subparts.
      */
      name_variant= NORMAL_PART_NAME;
      if (m_is_sub_partitioned)
      {
        List_iterator<partition_element> sub_it(part_elem->subpartitions);
        uint j= 0, part;
        do
        {
          partition_element *sub_elem= sub_it++;
          /* Subpartition handlers are laid out row-major in m_file. */
          part= i * num_subparts + j;
          if ((ret_error= create_subpartition_name(part_name_buff, path,
                                                   part_elem->partition_name,
                                                   sub_elem->partition_name,
                                                   name_variant)))
            error= ret_error;

          file= m_file[part];
          DBUG_PRINT("info", ("Drop subpartition %s", part_name_buff));
          /* Record errors but keep going so all droppable parts are tried. */
          if ((ret_error= file->ha_delete_table(part_name_buff)))
            error= ret_error;
          /* Mark the corresponding DDL log entry as executed. */
          if (deactivate_ddl_log_entry(sub_elem->log_entry->entry_pos))
            error= 1;
        } while (++j < num_subparts);
      }
      else
      {
        if ((ret_error= create_partition_name(part_name_buff, path,
                                              part_elem->partition_name,
                                              name_variant, TRUE)))
          error= ret_error;

        file= m_file[i];
        DBUG_PRINT("info", ("Drop partition %s", part_name_buff));
        if ((ret_error= file->ha_delete_table(part_name_buff)))
          error= ret_error;
        if (deactivate_ddl_log_entry(part_elem->log_entry->entry_pos))
          error= 1;
      }
      if (part_elem->part_state == PART_IS_CHANGED)
        part_elem->part_state= PART_NORMAL;
      else
        part_elem->part_state= PART_IS_DROPPED;
    }
  } while (++i < num_parts);
  /* Persist the DDL log changes made above. */
  (void) sync_ddl_log();
  DBUG_RETURN(error);
}
888 
889 
890 /*
891   Rename partitions as part of ALTER TABLE of partitions
892 
893   SYNOPSIS
894     rename_partitions()
895     path                        Complete path of db and table name
896 
897   RETURN VALUE
898     TRUE                        Failure
899     FALSE                       Success
900 
901   DESCRIPTION
902     When reorganising partitions, adding hash partitions and coalescing
903     partitions it can be necessary to rename partitions while holding
904     an exclusive lock on the table.
905     Which partitions to rename is given by state of partitions found by the
906     partition info struct referenced from the handler object
907 */
908 
rename_partitions(const char * path)909 int ha_partition::rename_partitions(const char *path)
910 {
911   List_iterator<partition_element> part_it(m_part_info->partitions);
912   List_iterator<partition_element> temp_it(m_part_info->temp_partitions);
913   char part_name_buff[FN_REFLEN];
914   char norm_name_buff[FN_REFLEN];
915   uint num_parts= m_part_info->partitions.elements;
916   uint part_count= 0;
917   uint num_subparts= m_part_info->num_subparts;
918   uint i= 0;
919   uint j= 0;
920   int error= 0;
921   int ret_error;
922   uint temp_partitions= m_part_info->temp_partitions.elements;
923   handler *file;
924   partition_element *part_elem, *sub_elem;
925   DBUG_ENTER("ha_partition::rename_partitions");
926 
927   /*
928     Assert that it works without HA_FILE_BASED and lower_case_table_name = 2.
929     We use m_file[0] as long as all partitions have the same storage engine.
930   */
931   DBUG_ASSERT(!strcmp(path, get_canonical_filename(m_file[0], path,
932                                                    norm_name_buff)));
933 
934   DEBUG_SYNC(ha_thd(), "before_rename_partitions");
935   if (temp_partitions)
936   {
937     /*
938       These are the reorganised partitions that have already been copied.
939       We delete the partitions and log the delete by inactivating the
940       delete log entry in the table log. We only need to synchronise
941       these writes before moving to the next loop since there is no
942       interaction among reorganised partitions, they cannot have the
943       same name.
944     */
945     do
946     {
947       part_elem= temp_it++;
948       if (m_is_sub_partitioned)
949       {
950         List_iterator<partition_element> sub_it(part_elem->subpartitions);
951         j= 0;
952         do
953         {
954           sub_elem= sub_it++;
955           file= m_reorged_file[part_count++];
956           if ((ret_error= create_subpartition_name(norm_name_buff, path,
957                                                    part_elem->partition_name,
958                                                    sub_elem->partition_name,
959                                                    NORMAL_PART_NAME)))
960            error= ret_error;
961 
962           DBUG_PRINT("info", ("Delete subpartition %s", norm_name_buff));
963           if ((ret_error= file->ha_delete_table(norm_name_buff)))
964             error= ret_error;
965           else if (deactivate_ddl_log_entry(sub_elem->log_entry->entry_pos))
966             error= 1;
967           else
968             sub_elem->log_entry= NULL; /* Indicate success */
969         } while (++j < num_subparts);
970       }
971       else
972       {
973         file= m_reorged_file[part_count++];
974         if ((ret_error= create_partition_name(norm_name_buff, path,
975                                               part_elem->partition_name,
976                                               NORMAL_PART_NAME, TRUE)))
977           error= ret_error;
978 
979         DBUG_PRINT("info", ("Delete partition %s", norm_name_buff));
980         if ((ret_error= file->ha_delete_table(norm_name_buff)))
981           error= ret_error;
982         else if (deactivate_ddl_log_entry(part_elem->log_entry->entry_pos))
983           error= 1;
984         else
985           part_elem->log_entry= NULL; /* Indicate success */
986       }
987     } while (++i < temp_partitions);
988     (void) sync_ddl_log();
989   }
990   i= 0;
991   do
992   {
993     /*
994        When state is PART_IS_CHANGED it means that we have created a new
995        TEMP partition that is to be renamed to normal partition name and
996        we are to delete the old partition with currently the normal name.
997 
998        We perform this operation by
999        1) Delete old partition with normal partition name
1000        2) Signal this in table log entry
1001        3) Synch table log to ensure we have consistency in crashes
1002        4) Rename temporary partition name to normal partition name
1003        5) Signal this to table log entry
1004        It is not necessary to synch the last state since a new rename
1005        should not corrupt things if there was no temporary partition.
1006 
1007        The only other parts we need to cater for are new parts that
1008        replace reorganised parts. The reorganised parts were deleted
1009        by the code above that goes through the temp_partitions list.
1010        Thus the synch above makes it safe to simply perform step 4 and 5
1011        for those entries.
1012     */
1013     part_elem= part_it++;
1014     if (part_elem->part_state == PART_IS_CHANGED ||
1015         part_elem->part_state == PART_TO_BE_DROPPED ||
1016         (part_elem->part_state == PART_IS_ADDED && temp_partitions))
1017     {
1018       if (m_is_sub_partitioned)
1019       {
1020         List_iterator<partition_element> sub_it(part_elem->subpartitions);
1021         uint part;
1022 
1023         j= 0;
1024         do
1025         {
1026           sub_elem= sub_it++;
1027           part= i * num_subparts + j;
1028           if ((ret_error= create_subpartition_name(norm_name_buff, path,
1029                                                    part_elem->partition_name,
1030                                                    sub_elem->partition_name,
1031                                                    NORMAL_PART_NAME)))
1032             error= ret_error;
1033 
1034           if (part_elem->part_state == PART_IS_CHANGED)
1035           {
1036             file= m_reorged_file[part_count++];
1037             DBUG_PRINT("info", ("Delete subpartition %s", norm_name_buff));
1038             if ((ret_error= file->ha_delete_table(norm_name_buff)))
1039               error= ret_error;
1040             else if (deactivate_ddl_log_entry(sub_elem->log_entry->entry_pos))
1041               error= 1;
1042             (void) sync_ddl_log();
1043           }
1044           file= m_new_file[part];
1045           if ((ret_error= create_subpartition_name(part_name_buff, path,
1046                                                    part_elem->partition_name,
1047                                                    sub_elem->partition_name,
1048                                                    TEMP_PART_NAME)))
1049             error= ret_error;
1050 
1051           DBUG_PRINT("info", ("Rename subpartition from %s to %s",
1052                      part_name_buff, norm_name_buff));
1053           if ((ret_error= file->ha_rename_table(part_name_buff,
1054                                                 norm_name_buff)))
1055             error= ret_error;
1056           else if (deactivate_ddl_log_entry(sub_elem->log_entry->entry_pos))
1057             error= 1;
1058           else
1059             sub_elem->log_entry= NULL;
1060         } while (++j < num_subparts);
1061       }
1062       else
1063       {
1064         if ((ret_error= create_partition_name(norm_name_buff, path,
1065                                               part_elem->partition_name,
1066                                               NORMAL_PART_NAME, TRUE)))
1067           error= ret_error;
1068 
1069         if (part_elem->part_state == PART_IS_CHANGED)
1070         {
1071           file= m_reorged_file[part_count++];
1072           DBUG_PRINT("info", ("Delete partition %s", norm_name_buff));
1073           if ((ret_error= file->ha_delete_table(norm_name_buff)))
1074             error= ret_error;
1075           else if (deactivate_ddl_log_entry(part_elem->log_entry->entry_pos))
1076             error= 1;
1077           (void) sync_ddl_log();
1078         }
1079         file= m_new_file[i];
1080         if ((error= create_partition_name(part_name_buff, path,
1081                                           part_elem->partition_name,
1082                                           TEMP_PART_NAME, TRUE)))
1083           error= ret_error;
1084         DBUG_PRINT("info", ("Rename partition from %s to %s",
1085                    part_name_buff, norm_name_buff));
1086         if ((ret_error= file->ha_rename_table(part_name_buff,
1087                                               norm_name_buff)))
1088           error= ret_error;
1089         else if (deactivate_ddl_log_entry(part_elem->log_entry->entry_pos))
1090           error= 1;
1091         else
1092           part_elem->log_entry= NULL;
1093       }
1094     }
1095   } while (++i < num_parts);
1096   (void) sync_ddl_log();
1097   DBUG_RETURN(error);
1098 }
1099 
1100 
/*
  Operation codes for handle_opt_partitions()/handle_opt_part().
  They double as indexes into opt_op_name[] below, so the two must be
  kept in sync (index 0 is unused).
*/
#define OPTIMIZE_PARTS 1
#define ANALYZE_PARTS 2
#define CHECK_PARTS   3
#define REPAIR_PARTS 4
#define ASSIGN_KEYCACHE_PARTS 5
#define PRELOAD_KEYS_PARTS 6

/* Human-readable operation names, indexed by the *_PARTS codes above. */
static const char *opt_op_name[]= {NULL,
                                   "optimize", "analyze", "check", "repair",
                                   "assign_to_keycache", "preload_keys"};
1111 
1112 /*
1113   Optimize table
1114 
1115   SYNOPSIS
1116     optimize()
1117     thd               Thread object
1118     check_opt         Check/analyze/repair/optimize options
1119 
1120   RETURN VALUES
1121     >0                Error
1122     0                 Success
1123 */
1124 
optimize(THD * thd,HA_CHECK_OPT * check_opt)1125 int ha_partition::optimize(THD *thd, HA_CHECK_OPT *check_opt)
1126 {
1127   DBUG_ENTER("ha_partition::optimize");
1128 
1129   DBUG_RETURN(handle_opt_partitions(thd, check_opt, OPTIMIZE_PARTS));
1130 }
1131 
1132 
1133 /*
1134   Analyze table
1135 
1136   SYNOPSIS
1137     analyze()
1138     thd               Thread object
1139     check_opt         Check/analyze/repair/optimize options
1140 
1141   RETURN VALUES
1142     >0                Error
1143     0                 Success
1144 */
1145 
analyze(THD * thd,HA_CHECK_OPT * check_opt)1146 int ha_partition::analyze(THD *thd, HA_CHECK_OPT *check_opt)
1147 {
1148   DBUG_ENTER("ha_partition::analyze");
1149 
1150   DBUG_RETURN(handle_opt_partitions(thd, check_opt, ANALYZE_PARTS));
1151 }
1152 
1153 
1154 /*
1155   Check table
1156 
1157   SYNOPSIS
1158     check()
1159     thd               Thread object
1160     check_opt         Check/analyze/repair/optimize options
1161 
1162   RETURN VALUES
1163     >0                Error
1164     0                 Success
1165 */
1166 
check(THD * thd,HA_CHECK_OPT * check_opt)1167 int ha_partition::check(THD *thd, HA_CHECK_OPT *check_opt)
1168 {
1169   DBUG_ENTER("ha_partition::check");
1170 
1171   DBUG_RETURN(handle_opt_partitions(thd, check_opt, CHECK_PARTS));
1172 }
1173 
1174 
1175 /*
1176   Repair table
1177 
1178   SYNOPSIS
1179     repair()
1180     thd               Thread object
1181     check_opt         Check/analyze/repair/optimize options
1182 
1183   RETURN VALUES
1184     >0                Error
1185     0                 Success
1186 */
1187 
repair(THD * thd,HA_CHECK_OPT * check_opt)1188 int ha_partition::repair(THD *thd, HA_CHECK_OPT *check_opt)
1189 {
1190   DBUG_ENTER("ha_partition::repair");
1191 
1192   DBUG_RETURN(handle_opt_partitions(thd, check_opt, REPAIR_PARTS));
1193 }
1194 
1195 /**
1196   Assign to keycache
1197 
1198   @param thd          Thread object
1199   @param check_opt    Check/analyze/repair/optimize options
1200 
1201   @return
1202     @retval >0        Error
1203     @retval 0         Success
1204 */
1205 
assign_to_keycache(THD * thd,HA_CHECK_OPT * check_opt)1206 int ha_partition::assign_to_keycache(THD *thd, HA_CHECK_OPT *check_opt)
1207 {
1208   DBUG_ENTER("ha_partition::assign_to_keycache");
1209 
1210   DBUG_RETURN(handle_opt_partitions(thd, check_opt, ASSIGN_KEYCACHE_PARTS));
1211 }
1212 
1213 
1214 /**
1215   Preload to keycache
1216 
1217   @param thd          Thread object
1218   @param check_opt    Check/analyze/repair/optimize options
1219 
1220   @return
1221     @retval >0        Error
1222     @retval 0         Success
1223 */
1224 
preload_keys(THD * thd,HA_CHECK_OPT * check_opt)1225 int ha_partition::preload_keys(THD *thd, HA_CHECK_OPT *check_opt)
1226 {
1227   DBUG_ENTER("ha_partition::preload_keys");
1228 
1229   DBUG_RETURN(handle_opt_partitions(thd, check_opt, PRELOAD_KEYS_PARTS));
1230 }
1231 
1232 
1233 /*
1234   Handle optimize/analyze/check/repair of one partition
1235 
1236   SYNOPSIS
1237     handle_opt_part()
1238     thd                      Thread object
1239     check_opt                Options
1240     file                     Handler object of partition
1241     flag                     Optimize/Analyze/Check/Repair flag
1242 
1243   RETURN VALUE
1244     >0                        Failure
1245     0                         Success
1246 */
1247 
handle_opt_part(THD * thd,HA_CHECK_OPT * check_opt,uint part_id,uint flag)1248 int ha_partition::handle_opt_part(THD *thd, HA_CHECK_OPT *check_opt,
1249                                   uint part_id, uint flag)
1250 {
1251   int error;
1252   handler *file= m_file[part_id];
1253   DBUG_ENTER("handle_opt_part");
1254   DBUG_PRINT("enter", ("flag = %u", flag));
1255 
1256   if (flag == OPTIMIZE_PARTS)
1257     error= file->ha_optimize(thd, check_opt);
1258   else if (flag == ANALYZE_PARTS)
1259     error= file->ha_analyze(thd, check_opt);
1260   else if (flag == CHECK_PARTS)
1261   {
1262     error= file->ha_check(thd, check_opt);
1263     if (!error ||
1264         error == HA_ADMIN_ALREADY_DONE ||
1265         error == HA_ADMIN_NOT_IMPLEMENTED)
1266     {
1267       if (check_opt->flags & (T_MEDIUM | T_EXTEND))
1268         error= check_misplaced_rows(part_id, false);
1269     }
1270   }
1271   else if (flag == REPAIR_PARTS)
1272   {
1273     error= file->ha_repair(thd, check_opt);
1274     if (!error ||
1275         error == HA_ADMIN_ALREADY_DONE ||
1276         error == HA_ADMIN_NOT_IMPLEMENTED)
1277     {
1278       if (check_opt->flags & (T_MEDIUM | T_EXTEND))
1279         error= check_misplaced_rows(part_id, true);
1280     }
1281   }
1282   else if (flag == ASSIGN_KEYCACHE_PARTS)
1283     error= file->assign_to_keycache(thd, check_opt);
1284   else if (flag == PRELOAD_KEYS_PARTS)
1285     error= file->preload_keys(thd, check_opt);
1286   else
1287   {
1288     DBUG_ASSERT(FALSE);
1289     error= 1;
1290   }
1291   if (error == HA_ADMIN_ALREADY_DONE)
1292     error= 0;
1293   DBUG_RETURN(error);
1294 }
1295 
1296 
1297 /*
1298    print a message row formatted for ANALYZE/CHECK/OPTIMIZE/REPAIR TABLE
1299    (modelled after mi_check_print_msg)
1300    TODO: move this into the handler, or rewrite mysql_admin_table.
1301 */
static bool print_admin_msg(THD* thd, uint len,
                            const char* msg_type,
                            const char* db_name, const char* table_name,
                            const char* op_name, const char *fmt, ...)
  ATTRIBUTE_FORMAT(printf, 7, 8);
/*
  Format a message and send it to the client as one admin-result row
  ("db.table", operation, message type, message text).  Falls back to the
  server error log when there is no client connection.

  Returns true on any failure (allocation, truncation, network write),
  false on success.
*/
static bool print_admin_msg(THD* thd, uint len,
                            const char* msg_type,
                            const char* db_name, const char* table_name,
                            const char* op_name, const char *fmt, ...)
{
  va_list args;
  Protocol *protocol= thd->protocol;
  uint length;
  uint msg_length;
  char name[NAME_LEN*2+2];          /* "db.table" plus terminator */
  char *msgbuf;
  bool error= true;

  if (!(msgbuf= (char*) my_malloc(len, MYF(0))))
    return true;
  va_start(args, fmt);
  msg_length= my_vsnprintf(msgbuf, len, fmt, args);
  va_end(args);
  /* Treat a truncated message as a failure rather than sending garbage. */
  if (msg_length >= (len - 1))
    goto err;
  msgbuf[len - 1] = 0; // healthy paranoia


  /* No client connection (e.g. background repair): log to stderr instead. */
  if (!thd->vio_ok())
  {
    sql_print_error("%s", msgbuf);
    goto err;
  }

  length=(uint) (strxmov(name, db_name, ".", table_name,NullS) - name);
  /*
     TODO: switch from protocol to push_warning here. The main reason we didn't
     it yet is parallel repair. Due to following trace:
     mi_check_print_msg/push_warning/sql_alloc/my_pthread_getspecific_ptr.

     Also we likely need to lock mutex here (in both cases with protocol and
     push_warning).
  */
  DBUG_PRINT("info",("print_admin_msg:  %s, %s, %s, %s", name, op_name,
                     msg_type, msgbuf));
  /* Row layout must match what mysql_admin_table sends to the client. */
  protocol->prepare_for_resend();
  protocol->store(name, length, system_charset_info);
  protocol->store(op_name, system_charset_info);
  protocol->store(msg_type, system_charset_info);
  protocol->store(msgbuf, msg_length, system_charset_info);
  if (protocol->write())
  {
    sql_print_error("Failed on my_net_write, writing to stderr instead: %s\n",
                    msgbuf);
    goto err;
  }
  error= false;
err:
  my_free(msgbuf);
  return error;
}
1363 
1364 
1365 /*
1366   Handle optimize/analyze/check/repair of partitions
1367 
1368   SYNOPSIS
1369     handle_opt_partitions()
1370     thd                      Thread object
1371     check_opt                Options
1372     flag                     Optimize/Analyze/Check/Repair flag
1373 
1374   RETURN VALUE
1375     >0                        Failure
1376     0                         Success
1377 */
1378 
/*
  Run one admin operation (optimize/analyze/check/repair/keycache) over all
  partitions, or only the named ones when ALTER ... ADMIN PARTITION marked
  them PART_ADMIN.  Stops at the first real error, after resetting the
  part_state of all remaining partitions back to PART_NORMAL.
*/
int ha_partition::handle_opt_partitions(THD *thd, HA_CHECK_OPT *check_opt,
                                        uint flag)
{
  List_iterator<partition_element> part_it(m_part_info->partitions);
  uint num_parts= m_part_info->num_parts;
  uint num_subparts= m_part_info->num_subparts;
  uint i= 0;
  int error;
  DBUG_ENTER("ha_partition::handle_opt_partitions");
  DBUG_PRINT("enter", ("flag= %u", flag));

  do
  {
    partition_element *part_elem= part_it++;
    /*
      when ALTER TABLE <CMD> PARTITION ...
      it should only do named partitions, otherwise all partitions
    */
    if (!(thd->lex->alter_info.flags & Alter_info::ALTER_ADMIN_PARTITION) ||
        part_elem->part_state == PART_ADMIN)
    {
      if (m_is_sub_partitioned)
      {
        /* Operate on each subpartition of this partition in turn. */
        List_iterator<partition_element> subpart_it(part_elem->subpartitions);
        partition_element *sub_elem;
        uint j= 0, part;
        do
        {
          sub_elem= subpart_it++;
          /* Flat index into m_file[] for subpartition j of partition i. */
          part= i * num_subparts + j;
          DBUG_PRINT("info", ("Optimize subpartition %u (%s)",
                     part, sub_elem->partition_name));
          if ((error= handle_opt_part(thd, check_opt, part, flag)))
          {
            /* print a line which partition the error belongs to */
            if (error != HA_ADMIN_NOT_IMPLEMENTED &&
                error != HA_ADMIN_ALREADY_DONE &&
                error != HA_ADMIN_TRY_ALTER)
            {
	      print_admin_msg(thd, MI_MAX_MSG_BUF, "error",
                              table_share->db.str, table->alias,
                              opt_op_name[flag],
                              "Subpartition %s returned error",
                              sub_elem->partition_name);
            }
            /* reset part_state for the remaining partitions */
            do
            {
              if (part_elem->part_state == PART_ADMIN)
                part_elem->part_state= PART_NORMAL;
            } while ((part_elem= part_it++));
            DBUG_RETURN(error);
          }
        } while (++j < num_subparts);
      }
      else
      {
        DBUG_PRINT("info", ("Optimize partition %u (%s)", i,
                            part_elem->partition_name));
        if ((error= handle_opt_part(thd, check_opt, i, flag)))
        {
          /* print a line which partition the error belongs to */
          if (error != HA_ADMIN_NOT_IMPLEMENTED &&
              error != HA_ADMIN_ALREADY_DONE &&
              error != HA_ADMIN_TRY_ALTER)
          {
	    print_admin_msg(thd, MI_MAX_MSG_BUF, "error",
                            table_share->db.str, table->alias,
                            opt_op_name[flag], "Partition %s returned error",
                            part_elem->partition_name);
          }
          /* reset part_state for the remaining partitions */
          do
          {
            if (part_elem->part_state == PART_ADMIN)
              part_elem->part_state= PART_NORMAL;
          } while ((part_elem= part_it++));
          DBUG_RETURN(error);
        }
      }
      /* Successful: restore the normal state for this partition. */
      part_elem->part_state= PART_NORMAL;
    }
  } while (++i < num_parts);
  DBUG_RETURN(FALSE);
}
1464 
1465 
1466 /**
  @brief Check and repair the table if necessary
1468 
1469   @param thd    Thread object
1470 
1471   @retval TRUE  Error/Not supported
1472   @retval FALSE Success
1473 
1474   @note Called if open_table_from_share fails and ::is_crashed().
1475 */
1476 
check_and_repair(THD * thd)1477 bool ha_partition::check_and_repair(THD *thd)
1478 {
1479   handler **file= m_file;
1480   DBUG_ENTER("ha_partition::check_and_repair");
1481 
1482   do
1483   {
1484     if ((*file)->ha_check_and_repair(thd))
1485       DBUG_RETURN(TRUE);
1486   } while (*(++file));
1487   DBUG_RETURN(FALSE);
1488 }
1489 
1490 
1491 /**
  @brief Check if the table can be automatically repaired
1493 
1494   @retval TRUE  Can be auto repaired
1495   @retval FALSE Cannot be auto repaired
1496 */
1497 
auto_repair() const1498 bool ha_partition::auto_repair() const
1499 {
1500   DBUG_ENTER("ha_partition::auto_repair");
1501 
1502   /*
1503     As long as we only support one storage engine per table,
1504     we can use the first partition for this function.
1505   */
1506   DBUG_RETURN(m_file[0]->auto_repair());
1507 }
1508 
1509 
1510 /**
  @brief Check if the table is crashed
1512 
1513   @retval TRUE  Crashed
1514   @retval FALSE Not crashed
1515 */
1516 
is_crashed() const1517 bool ha_partition::is_crashed() const
1518 {
1519   handler **file= m_file;
1520   DBUG_ENTER("ha_partition::is_crashed");
1521 
1522   do
1523   {
1524     if ((*file)->is_crashed())
1525       DBUG_RETURN(TRUE);
1526   } while (*(++file));
1527   DBUG_RETURN(FALSE);
1528 }
1529 
1530 
1531 /*
1532   Prepare by creating a new partition
1533 
1534   SYNOPSIS
1535     prepare_new_partition()
1536     table                      Table object
1537     create_info                Create info from CREATE TABLE
1538     file                       Handler object of new partition
1539     part_name                  partition name
1540 
1541   RETURN VALUE
1542     >0                         Error
1543     0                          Success
1544 */
1545 
int ha_partition::prepare_new_partition(TABLE *tbl,
                                        HA_CREATE_INFO *create_info,
                                        handler *file, const char *part_name,
                                        partition_element *p_elem,
                                        uint disable_non_uniq_indexes)
{
  int error;
  DBUG_ENTER("prepare_new_partition");

  /*
    This call to set_up_table_before_create() is done for an alter table.
    So this may be the second time around for this partition_element,
    depending on how many partitions and subpartitions there were before,
    and how many there are now.
    The first time, on the CREATE, data_file_name and index_file_name
    came from the parser.  They did not have the file name attached to
    the end.  But if this partition is less than the total number of
    previous partitions, it's data_file_name has the filename attached.
    So we need to take the partition filename off if it exists.
    That file name may be different from part_name, which will be
    attached in append_file_to_dir().
  */
  truncate_partition_filename(p_elem->data_file_name);
  truncate_partition_filename(p_elem->index_file_name);

  if ((error= set_up_table_before_create(tbl, part_name, create_info, p_elem)))
    goto error_create;

  /* Create the physical partition; cleanup labels unwind in reverse order. */
  if ((error= file->ha_create(part_name, tbl, create_info)))
  {
    /*
      Added for safety, InnoDB reports HA_ERR_FOUND_DUPP_KEY
      if the table/partition already exists.
      If we return that error code, then print_error would try to
      get_dup_key on a non-existing partition.
      So return a more reasonable error code.
    */
    if (error == HA_ERR_FOUND_DUPP_KEY)
      error= HA_ERR_TABLE_EXIST;
    goto error_create;
  }
  DBUG_PRINT("info", ("partition %s created", part_name));
  if ((error= file->ha_open(tbl, part_name, m_mode,
                            m_open_test_lock | HA_OPEN_NO_PSI_CALL)))
    goto error_open;
  DBUG_PRINT("info", ("partition %s opened", part_name));

  /*
    Note: if you plan to add another call that may return failure,
    better to do it before external_lock() as cleanup_new_partition()
    assumes that external_lock() is last call that may fail here.
    Otherwise see description for cleanup_new_partition().
  */
  if ((error= file->ha_external_lock(ha_thd(), F_WRLCK)))
    goto error_external_lock;
  DBUG_PRINT("info", ("partition %s external locked", part_name));

  /* Speeds up the bulk copy into the new partition (e.g. ALTER ... COPY). */
  if (disable_non_uniq_indexes)
    file->ha_disable_indexes(HA_KEY_SWITCH_NONUNIQ_SAVE);

  DBUG_RETURN(0);
error_external_lock:
  (void) file->ha_close();
error_open:
  (void) file->ha_delete_table(part_name);
error_create:
  DBUG_RETURN(error);
}
1614 
1615 
1616 /*
1617   Cleanup by removing all created partitions after error
1618 
1619   SYNOPSIS
1620     cleanup_new_partition()
1621     part_count             Number of partitions to remove
1622 
1623   RETURN VALUE
1624     NONE
1625 
1626   DESCRIPTION
1627     This function is called immediately after prepare_new_partition() in
1628     case the latter fails.
1629 
1630     In prepare_new_partition() last call that may return failure is
1631     external_lock(). That means if prepare_new_partition() fails,
1632     partition does not have external lock. Thus no need to call
1633     external_lock(F_UNLCK) here.
1634 
1635   TODO:
1636     We must ensure that in the case that we get an error during the process
1637     that we call external_lock with F_UNLCK, close the table and delete the
1638     table in the case where we have been successful with prepare_handler.
1639     We solve this by keeping an array of successful calls to prepare_handler
1640     which can then be used to undo the call.
1641 */
1642 
cleanup_new_partition(uint part_count)1643 void ha_partition::cleanup_new_partition(uint part_count)
1644 {
1645   DBUG_ENTER("ha_partition::cleanup_new_partition");
1646 
1647   if (m_added_file)
1648   {
1649     THD *thd= ha_thd();
1650     handler **file= m_added_file;
1651     while ((part_count > 0) && (*file))
1652     {
1653       (*file)->ha_external_lock(thd, F_UNLCK);
1654       (*file)->ha_close();
1655 
1656       /* Leave the (*file)->ha_delete_table(part_name) to the ddl-log */
1657 
1658       file++;
1659       part_count--;
1660     }
1661     m_added_file= NULL;
1662   }
1663   DBUG_VOID_RETURN;
1664 }
1665 
1666 /*
1667   Implement the partition changes defined by ALTER TABLE of partitions
1668 
1669   SYNOPSIS
1670     change_partitions()
1671     create_info                 HA_CREATE_INFO object describing all
1672                                 fields and indexes in table
1673     path                        Complete path of db and table name
1674     out: copied                 Output parameter where number of copied
1675                                 records are added
1676     out: deleted                Output parameter where number of deleted
1677                                 records are added
1678     pack_frm_data               Reference to packed frm file
1679     pack_frm_len                Length of packed frm file
1680 
1681   RETURN VALUE
1682     >0                        Failure
1683     0                         Success
1684 
1685   DESCRIPTION
1686     Add and copy if needed a number of partitions, during this operation
1687     no other operation is ongoing in the server. This is used by
1688     ADD PARTITION all types as well as by REORGANIZE PARTITION. For
1689     one-phased implementations it is used also by DROP and COALESCE
1690     PARTITIONs.
1691     One-phased implementation needs the new frm file, other handlers will
1692     get zero length and a NULL reference here.
1693 */
1694 
change_partitions(HA_CREATE_INFO * create_info,const char * path,ulonglong * const copied,ulonglong * const deleted,const uchar * pack_frm_data MY_ATTRIBUTE ((unused)),size_t pack_frm_len MY_ATTRIBUTE ((unused)))1695 int ha_partition::change_partitions(HA_CREATE_INFO *create_info,
1696                                     const char *path,
1697                                     ulonglong * const copied,
1698                                     ulonglong * const deleted,
1699                                     const uchar *pack_frm_data
1700                                     MY_ATTRIBUTE((unused)),
1701                                     size_t pack_frm_len
1702                                     MY_ATTRIBUTE((unused)))
1703 {
1704   List_iterator<partition_element> part_it(m_part_info->partitions);
1705   List_iterator <partition_element> t_it(m_part_info->temp_partitions);
1706   char part_name_buff[FN_REFLEN + 1];
1707   uint num_parts= m_part_info->partitions.elements;
1708   uint num_subparts= m_part_info->num_subparts;
1709   uint i= 0;
1710   uint num_remain_partitions, part_count, orig_count;
1711   handler **new_file_array;
1712   int error= 1;
1713   bool first;
1714   uint temp_partitions= m_part_info->temp_partitions.elements;
1715   THD *thd= ha_thd();
1716   DBUG_ENTER("ha_partition::change_partitions");
1717 
1718   /*
1719     Assert that it works without HA_FILE_BASED and lower_case_table_name = 2.
1720     We use m_file[0] as long as all partitions have the same storage engine.
1721   */
1722   DBUG_ASSERT(!strcmp(path, get_canonical_filename(m_file[0], path,
1723                                                    part_name_buff)));
1724   m_reorged_parts= 0;
1725   if (!m_part_info->is_sub_partitioned())
1726     num_subparts= 1;
1727 
1728   /*
1729     Step 1:
1730       Calculate number of reorganised partitions and allocate space for
1731       their handler references.
1732   */
1733   if (temp_partitions)
1734   {
1735     m_reorged_parts= temp_partitions * num_subparts;
1736   }
1737   else
1738   {
1739     do
1740     {
1741       partition_element *part_elem= part_it++;
1742       if (part_elem->part_state == PART_CHANGED ||
1743           part_elem->part_state == PART_REORGED_DROPPED)
1744       {
1745         m_reorged_parts+= num_subparts;
1746       }
1747     } while (++i < num_parts);
1748   }
1749   if (m_reorged_parts &&
1750       !(m_reorged_file= (handler**)sql_calloc(sizeof(handler*)*
1751                                               (m_reorged_parts + 1))))
1752   {
1753     mem_alloc_error(sizeof(handler*)*(m_reorged_parts+1));
1754     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
1755   }
1756 
1757   /*
1758     Step 2:
1759       Calculate number of partitions after change and allocate space for
1760       their handler references.
1761   */
1762   num_remain_partitions= 0;
1763   if (temp_partitions)
1764   {
1765     num_remain_partitions= num_parts * num_subparts;
1766   }
1767   else
1768   {
1769     part_it.rewind();
1770     i= 0;
1771     do
1772     {
1773       partition_element *part_elem= part_it++;
1774       if (part_elem->part_state == PART_NORMAL ||
1775           part_elem->part_state == PART_TO_BE_ADDED ||
1776           part_elem->part_state == PART_CHANGED)
1777       {
1778         num_remain_partitions+= num_subparts;
1779       }
1780     } while (++i < num_parts);
1781   }
1782   if (!(new_file_array= (handler**)sql_calloc(sizeof(handler*)*
1783                                             (2*(num_remain_partitions + 1)))))
1784   {
1785     mem_alloc_error(sizeof(handler*)*2*(num_remain_partitions+1));
1786     DBUG_RETURN(HA_ERR_OUT_OF_MEM);
1787   }
1788   m_added_file= &new_file_array[num_remain_partitions + 1];
1789 
1790   /*
1791     Step 3:
1792       Fill m_reorged_file with handler references and NULL at the end
1793   */
1794   if (m_reorged_parts)
1795   {
1796     i= 0;
1797     part_count= 0;
1798     first= TRUE;
1799     part_it.rewind();
1800     do
1801     {
1802       partition_element *part_elem= part_it++;
1803       if (part_elem->part_state == PART_CHANGED ||
1804           part_elem->part_state == PART_REORGED_DROPPED)
1805       {
1806         memcpy((void*)&m_reorged_file[part_count],
1807                (void*)&m_file[i*num_subparts],
1808                sizeof(handler*)*num_subparts);
1809         part_count+= num_subparts;
1810       }
1811       else if (first && temp_partitions &&
1812                part_elem->part_state == PART_TO_BE_ADDED)
1813       {
1814         /*
1815           When doing an ALTER TABLE REORGANIZE PARTITION a number of
1816           partitions is to be reorganised into a set of new partitions.
1817           The reorganised partitions are in this case in the temp_partitions
1818           list. We copy all of them in one batch and thus we only do this
1819           until we find the first partition with state PART_TO_BE_ADDED
1820           since this is where the new partitions go in and where the old
1821           ones used to be.
1822         */
1823         first= FALSE;
1824         DBUG_ASSERT(((i*num_subparts) + m_reorged_parts) <= m_file_tot_parts);
1825         memcpy((void*)m_reorged_file, &m_file[i*num_subparts],
1826                sizeof(handler*)*m_reorged_parts);
1827       }
1828     } while (++i < num_parts);
1829   }
1830 
1831   /*
1832     Step 4:
1833       Fill new_array_file with handler references. Create the handlers if
1834       needed.
1835   */
1836   i= 0;
1837   part_count= 0;
1838   orig_count= 0;
1839   first= TRUE;
1840   part_it.rewind();
1841   do
1842   {
1843     partition_element *part_elem= part_it++;
1844     if (part_elem->part_state == PART_NORMAL)
1845     {
1846       DBUG_ASSERT(orig_count + num_subparts <= m_file_tot_parts);
1847       memcpy((void*)&new_file_array[part_count], (void*)&m_file[orig_count],
1848              sizeof(handler*)*num_subparts);
1849       part_count+= num_subparts;
1850       orig_count+= num_subparts;
1851     }
1852     else if (part_elem->part_state == PART_CHANGED ||
1853              part_elem->part_state == PART_TO_BE_ADDED)
1854     {
1855       uint j= 0;
1856       Parts_share_refs *p_share_refs;
1857       /*
1858         The Handler_shares for each partition's handler can be allocated
1859         within this handler, since there will not be any more instances of the
1860         new partitions, until the table is reopened after the ALTER succeeded.
1861       */
1862       p_share_refs= new Parts_share_refs;
1863       if (!p_share_refs)
1864         DBUG_RETURN(HA_ERR_OUT_OF_MEM);
1865       if (p_share_refs->init(num_subparts))
1866         DBUG_RETURN(HA_ERR_OUT_OF_MEM);
1867       if (m_new_partitions_share_refs.push_back(p_share_refs))
1868         DBUG_RETURN(HA_ERR_OUT_OF_MEM);
1869       do
1870       {
1871         handler **new_file= &new_file_array[part_count++];
1872         if (!(*new_file=
1873               get_new_handler(table->s,
1874                               thd->mem_root,
1875                               part_elem->engine_type)))
1876         {
1877           mem_alloc_error(sizeof(handler));
1878           DBUG_RETURN(HA_ERR_OUT_OF_MEM);
1879         }
1880         if ((*new_file)->set_ha_share_ref(&p_share_refs->ha_shares[j]))
1881         {
1882           DBUG_RETURN(HA_ERR_OUT_OF_MEM);
1883         }
1884       } while (++j < num_subparts);
1885       if (part_elem->part_state == PART_CHANGED)
1886         orig_count+= num_subparts;
1887       else if (temp_partitions && first)
1888       {
1889         orig_count+= (num_subparts * temp_partitions);
1890         first= FALSE;
1891       }
1892     }
1893   } while (++i < num_parts);
1894   first= FALSE;
1895   /*
1896     Step 5:
1897       Create the new partitions and also open, lock and call external_lock
1898       on them to prepare them for copy phase and also for later close
1899       calls
1900   */
1901 
1902   /*
1903      Before creating new partitions check whether indexes are disabled
1904      in the  partitions.
1905   */
1906 
1907   uint disable_non_uniq_indexes = indexes_are_disabled();
1908 
1909   i= 0;
1910   part_count= 0;
1911   part_it.rewind();
1912   do
1913   {
1914     partition_element *part_elem= part_it++;
1915     if (part_elem->part_state == PART_TO_BE_ADDED ||
1916         part_elem->part_state == PART_CHANGED)
1917     {
1918       /*
1919         A new partition needs to be created PART_TO_BE_ADDED means an
1920         entirely new partition and PART_CHANGED means a changed partition
1921         that will still exist with either more or less data in it.
1922       */
1923       uint name_variant= NORMAL_PART_NAME;
1924       if (part_elem->part_state == PART_CHANGED ||
1925           (part_elem->part_state == PART_TO_BE_ADDED && temp_partitions))
1926         name_variant= TEMP_PART_NAME;
1927       if (m_part_info->is_sub_partitioned())
1928       {
1929         List_iterator<partition_element> sub_it(part_elem->subpartitions);
1930         uint j= 0, part;
1931         do
1932         {
1933           partition_element *sub_elem= sub_it++;
1934           if ((error= create_subpartition_name(part_name_buff, path,
1935                                                part_elem->partition_name,
1936                                                sub_elem->partition_name,
1937                                                name_variant)))
1938           {
1939             cleanup_new_partition(part_count);
1940             DBUG_RETURN(error);
1941           }
1942 
1943           part= i * num_subparts + j;
1944           DBUG_PRINT("info", ("Add subpartition %s", part_name_buff));
1945           if ((error= prepare_new_partition(table, create_info,
1946                                             new_file_array[part],
1947                                             (const char *)part_name_buff,
1948                                             sub_elem,
1949                                             disable_non_uniq_indexes)))
1950           {
1951             cleanup_new_partition(part_count);
1952             DBUG_RETURN(error);
1953           }
1954 
1955           m_added_file[part_count++]= new_file_array[part];
1956         } while (++j < num_subparts);
1957       }
1958       else
1959       {
1960         if ((error= create_partition_name(part_name_buff, path,
1961                                           part_elem->partition_name,
1962                                           name_variant, TRUE)))
1963         {
1964           cleanup_new_partition(part_count);
1965           DBUG_RETURN(error);
1966         }
1967 
1968         DBUG_PRINT("info", ("Add partition %s", part_name_buff));
1969         if ((error= prepare_new_partition(table, create_info,
1970                                           new_file_array[i],
1971                                           (const char *)part_name_buff,
1972                                           part_elem,
1973                                           disable_non_uniq_indexes)))
1974         {
1975           cleanup_new_partition(part_count);
1976           DBUG_RETURN(error);
1977         }
1978 
1979         m_added_file[part_count++]= new_file_array[i];
1980       }
1981     }
1982   } while (++i < num_parts);
1983 
1984   /*
1985     Step 6:
1986       State update to prepare for next write of the frm file.
1987   */
1988   i= 0;
1989   part_it.rewind();
1990   do
1991   {
1992     partition_element *part_elem= part_it++;
1993     if (part_elem->part_state == PART_TO_BE_ADDED)
1994       part_elem->part_state= PART_IS_ADDED;
1995     else if (part_elem->part_state == PART_CHANGED)
1996       part_elem->part_state= PART_IS_CHANGED;
1997     else if (part_elem->part_state == PART_REORGED_DROPPED)
1998       part_elem->part_state= PART_TO_BE_DROPPED;
1999   } while (++i < num_parts);
2000   for (i= 0; i < temp_partitions; i++)
2001   {
2002     partition_element *part_elem= t_it++;
2003     DBUG_ASSERT(part_elem->part_state == PART_TO_BE_REORGED);
2004     part_elem->part_state= PART_TO_BE_DROPPED;
2005   }
2006   m_new_file= new_file_array;
2007   if ((error= copy_partitions(copied, deleted)))
2008   {
2009     /*
2010       Close and unlock the new temporary partitions.
2011       They will later be deleted through the ddl-log.
2012     */
2013     cleanup_new_partition(part_count);
2014   }
2015   DBUG_RETURN(error);
2016 }
2017 
2018 
2019 /*
2020   Copy partitions as part of ALTER TABLE of partitions
2021 
2022   SYNOPSIS
2023     copy_partitions()
2024     out:copied                 Number of records copied
2025     out:deleted                Number of records deleted
2026 
2027   RETURN VALUE
2028     >0                         Error code
2029     0                          Success
2030 
2031   DESCRIPTION
2032     change_partitions has done all the preparations, now it is time to
2033     actually copy the data from the reorganised partitions to the new
2034     partitions.
2035 */
2036 
int ha_partition::copy_partitions(ulonglong * const copied,
                                  ulonglong * const deleted)
{
  uint reorg_part= 0;   /* Index of old partition currently being scanned */
  int result= 0;
  longlong func_value;  /* Out-parameter of get_partition_id, not used here */
  DBUG_ENTER("ha_partition::copy_partitions");

  if (m_part_info->linear_hash_ind)
  {
    /*
      LINEAR HASH/KEY partitioning: recalculate the hash mask so that
      get_partition_id() below maps rows according to the new number of
      (sub)partitions.
    */
    if (m_part_info->part_type == HASH_PARTITION)
      set_linear_hash_mask(m_part_info, m_part_info->num_parts);
    else
      set_linear_hash_mask(m_part_info, m_part_info->num_subparts);
  }

  /* Full scan of each reorganised partition, re-inserting every row. */
  while (reorg_part < m_reorged_parts)
  {
    handler *file= m_reorged_file[reorg_part];
    uint32 new_part;

    late_extra_cache(reorg_part);
    if ((result= file->ha_rnd_init(1)))
      goto init_error;
    while (TRUE)
    {
      if ((result= file->ha_rnd_next(m_rec0)))
      {
        if (result == HA_ERR_RECORD_DELETED)
          continue;                              //Probably MyISAM
        if (result != HA_ERR_END_OF_FILE)
          goto error;
        /*
          End-of-file reached, break out to continue with next partition or
          end the copy process.
        */
        break;
      }
      /* Found record to insert into new handler */
      if (m_part_info->get_partition_id(m_part_info, &new_part,
                                        &func_value))
      {
        /*
           This record is in the original table but will not be in the new
           table since it doesn't fit into any partition any longer due to
           changed partitioning ranges or list values.
        */
        (*deleted)++;
      }
      else
      {
        THD *thd= ha_thd();
        /* Copy record to new handler */
        (*copied)++;
        tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
        result= m_new_file[new_part]->ha_write_row(m_rec0);
        reenable_binlog(thd);
        if (result)
          goto error;
      }
    }
    late_extra_no_cache(reorg_part);
    file->ha_rnd_end();
    reorg_part++;
  }
  DBUG_RETURN(FALSE);
error:
  /* A row scan is active on the failing partition; close it first. */
  m_reorged_file[reorg_part]->ha_rnd_end();
init_error:
  DBUG_RETURN(result);
}
2108 
2109 /*
2110   Update create info as part of ALTER TABLE
2111 
2112   SYNOPSIS
2113     update_create_info()
2114     create_info                   Create info from ALTER TABLE
2115 
2116   RETURN VALUE
2117     NONE
2118 
2119   DESCRIPTION
  Forward this handler call to the storage engine for each
2121   partition handler.  The data_file_name for each partition may
2122   need to be reset if the tablespace was moved.  Use a dummy
2123   HA_CREATE_INFO structure and transfer necessary data.
2124 */
2125 
void ha_partition::update_create_info(HA_CREATE_INFO *create_info)
{
  DBUG_ENTER("ha_partition::update_create_info");

  /*
    Fix for bug#38751, some engines needs info-calls in ALTER.
    Archive need this since it flushes in ::info.
    HA_STATUS_AUTO is optimized so it will not always be forwarded
    to all partitions, but HA_STATUS_VARIABLE will.
  */
  info(HA_STATUS_VARIABLE);

  info(HA_STATUS_AUTO);

  if (!(create_info->used_fields & HA_CREATE_USED_AUTO))
    create_info->auto_increment_value= stats.auto_increment_value;

  /*
    DATA DIRECTORY and INDEX DIRECTORY are never applied to the whole
    partitioned table, only its parts.
  */
  /*
    NOTE(review): (const char*) -1 is an in-band sentinel on data_file_name
    meaning "called from ALTER TABLE"; presumably set by the caller in the
    ALTER path — confirm against mysql_prepare_alter_table().
  */
  my_bool from_alter = (create_info->data_file_name == (const char*) -1);
  create_info->data_file_name= create_info->index_file_name = NULL;

  /*
  We do not need to update the individual partition DATA DIRECTORY settings
  since they can be changed by ALTER TABLE ... REORGANIZE PARTITIONS.
  */
  if (from_alter)
    DBUG_VOID_RETURN;

  /*
    send Handler::update_create_info() to the storage engine for each
    partition that currently has a handler object.  Using a dummy
    HA_CREATE_INFO structure to collect DATA and INDEX DIRECTORYs.
  */

  List_iterator<partition_element> part_it(m_part_info->partitions);
  partition_element *part_elem, *sub_elem;
  uint num_subparts= m_part_info->num_subparts;
  uint num_parts = num_subparts ? m_file_tot_parts / num_subparts
                                : m_file_tot_parts;
  HA_CREATE_INFO dummy_info;
  memset(&dummy_info, 0, sizeof(dummy_info));

  /*
  Since update_create_info() can be called from mysql_prepare_alter_table()
  when not all handlers are set up, we look for that condition first.
  If all handlers are not available, do not call update_create_info for any.
  */
  /* First pass: dry-run over all (sub)partitions; bail out on any gap. */
  uint i, j, part;
  for (i= 0; i < num_parts; i++)
  {
    part_elem= part_it++;
    if (!part_elem)
      DBUG_VOID_RETURN;
    if (m_is_sub_partitioned)
    {
      List_iterator<partition_element> subpart_it(part_elem->subpartitions);
      for (j= 0; j < num_subparts; j++)
      {
        sub_elem= subpart_it++;
        if (!sub_elem)
          DBUG_VOID_RETURN;
        part= i * num_subparts + j;
        if (part >= m_file_tot_parts || !m_file[part])
          DBUG_VOID_RETURN;
      }
    }
    else
    {
      if (!m_file[i])
        DBUG_VOID_RETURN;
    }
  }
  part_it.rewind();

  /* Second pass: collect DATA/INDEX DIRECTORY from each InnoDB partition. */
  for (i= 0; i < num_parts; i++)
  {
    part_elem= part_it++;
    DBUG_ASSERT(part_elem);
    if (m_is_sub_partitioned)
    {
      List_iterator<partition_element> subpart_it(part_elem->subpartitions);
      for (j= 0; j < num_subparts; j++)
      {
        sub_elem= subpart_it++;
        DBUG_ASSERT(sub_elem);
        part= i * num_subparts + j;
        DBUG_ASSERT(part < m_file_tot_parts && m_file[part]);
        if (ha_legacy_type(m_file[part]->ht) == DB_TYPE_INNODB)
        {
          /* Reset dummy_info so stale values never leak between parts. */
          dummy_info.data_file_name= dummy_info.index_file_name = NULL;
          m_file[part]->update_create_info(&dummy_info);

          if (dummy_info.data_file_name || sub_elem->data_file_name)
          {
            sub_elem->data_file_name = (char*) dummy_info.data_file_name;
          }
          if (dummy_info.index_file_name || sub_elem->index_file_name)
          {
            sub_elem->index_file_name = (char*) dummy_info.index_file_name;
          }
        }
      }
    }
    else
    {
      DBUG_ASSERT(m_file[i]);
      if (ha_legacy_type(m_file[i]->ht) == DB_TYPE_INNODB)
      {
        dummy_info.data_file_name= dummy_info.index_file_name= NULL;
        m_file[i]->update_create_info(&dummy_info);
        if (dummy_info.data_file_name || part_elem->data_file_name)
        {
          part_elem->data_file_name = (char*) dummy_info.data_file_name;
        }
        if (dummy_info.index_file_name || part_elem->index_file_name)
        {
          part_elem->index_file_name = (char*) dummy_info.index_file_name;
        }
      }
    }
  }
  DBUG_VOID_RETURN;
}
2252 
2253 
2254 /**
2255   Change the internal TABLE_SHARE pointer
2256 
2257   @param table_arg    TABLE object
2258   @param share        New share to use
2259 
2260   @note Is used in error handling in ha_delete_table.
2261   All handlers should exist (lock_partitions should not be used)
2262 */
2263 
change_table_ptr(TABLE * table_arg,TABLE_SHARE * share)2264 void ha_partition::change_table_ptr(TABLE *table_arg, TABLE_SHARE *share)
2265 {
2266   handler **file_array;
2267   table= table_arg;
2268   table_share= share;
2269   /*
2270     m_file can be NULL when using an old cached table in DROP TABLE, when the
2271     table just has REMOVED PARTITIONING, see Bug#42438
2272   */
2273   if (m_file)
2274   {
2275     file_array= m_file;
2276     DBUG_ASSERT(*file_array);
2277     do
2278     {
2279       (*file_array)->change_table_ptr(table_arg, share);
2280     } while (*(++file_array));
2281   }
2282 
2283   if (m_added_file && m_added_file[0])
2284   {
2285     /* if in middle of a drop/rename etc */
2286     file_array= m_added_file;
2287     do
2288     {
2289       (*file_array)->change_table_ptr(table_arg, share);
2290     } while (*(++file_array));
2291   }
2292 }
2293 
2294 /*
2295   Change comments specific to handler
2296 
2297   SYNOPSIS
2298     update_table_comment()
2299     comment                       Original comment
2300 
2301   RETURN VALUE
2302     new comment
2303 
2304   DESCRIPTION
2305     No comment changes so far
2306 */
2307 
update_table_comment(const char * comment)2308 char *ha_partition::update_table_comment(const char *comment)
2309 {
2310   return (char*) comment;                       /* Nothing to change */
2311 }
2312 
2313 
2314 /**
2315   Handle delete and rename table
2316 
2317     @param from         Full path of old table
2318     @param to           Full path of new table
2319 
2320   @return Operation status
2321     @retval >0  Error
2322     @retval 0   Success
2323 
2324   @note  Common routine to handle delete_table and rename_table.
2325   The routine uses the partition handler file to get the
2326   names of the partition instances. Both these routines
2327   are called after creating the handler without table
2328   object and thus the file is needed to discover the
2329   names of the partitions and the underlying storage engines.
2330 */
2331 
int ha_partition::del_ren_table(const char *from, const char *to)
{
  /* to == NULL selects the delete branch, non-NULL selects rename. */
  int save_error= 0;
  int error= HA_ERR_INTERNAL_ERROR;
  char from_buff[FN_REFLEN + 1], to_buff[FN_REFLEN + 1], from_lc_buff[FN_REFLEN + 1],
       to_lc_buff[FN_REFLEN + 1], buff[FN_REFLEN + 1];
  char *name_buffer_ptr;
  const char *from_path;
  const char *to_path= NULL;
  uint i;
  handler **file, **abort_file;
  DBUG_ENTER("ha_partition::del_ren_table");

  fn_format(buff,from, "", ha_par_ext, MY_APPEND_EXT);
  /* Check if the  par file exists */
  if (my_access(buff,F_OK))
  {
    /*
      If the .par file does not exist, return HA_ERR_NO_SUCH_TABLE,
      This will signal to the caller that it can remove the .frm
      file.
    */
    error= HA_ERR_NO_SUCH_TABLE;
    DBUG_RETURN(error);
  }

  /* Load partition names and engines from the .par file. */
  if (get_from_handler_file(from, ha_thd()->mem_root, false))
    DBUG_RETURN(error);
  DBUG_ASSERT(m_file_buffer);
  DBUG_PRINT("enter", ("from: (%s) to: (%s)", from, to ? to : "(nil)"));
  name_buffer_ptr= m_name_buffer_ptr;
  file= m_file;
  /*
    Since ha_partition has HA_FILE_BASED, it must alter underlying table names
    if they do not have HA_FILE_BASED and lower_case_table_names == 2.
    See Bug#37402, for Mac OS X.
    The appended #P#<partname>[#SP#<subpartname>] will remain in current case.
    Using the first partitions handler, since mixing handlers is not allowed.
  */
  from_path= get_canonical_filename(*file, from, from_lc_buff);
  if (to != NULL)
    to_path= get_canonical_filename(*file, to, to_lc_buff);
  i= 0;
  /* Delete/rename each partition; names are NUL-separated in the buffer. */
  do
  {
    if ((error= create_partition_name(from_buff, from_path, name_buffer_ptr,
                                      NORMAL_PART_NAME, FALSE)))
      goto rename_error;

    if (to != NULL)
    {                                           // Rename branch
      if ((error= create_partition_name(to_buff, to_path, name_buffer_ptr,
                                        NORMAL_PART_NAME, FALSE)))
        goto rename_error;

      error= (*file)->ha_rename_table(from_buff, to_buff);
      if (error)
        goto rename_error;
    }
    else                                        // delete branch
    {
      error= (*file)->ha_delete_table(from_buff);
    }
    name_buffer_ptr= strend(name_buffer_ptr) + 1;
    if (error)
      save_error= error;                        /* Remember first failure,
                                                   but keep deleting. */
    i++;
  } while (*(++file));

  if (to == NULL)
  {
    DBUG_EXECUTE_IF("crash_before_deleting_par_file", DBUG_SUICIDE(););

    /* Delete the .par file. If error, break.*/
    if ((error= handler::delete_table(from)))
      DBUG_RETURN(error);

    DBUG_EXECUTE_IF("crash_after_deleting_par_file", DBUG_SUICIDE(););
  }

  if (to != NULL)
  {
    /* Rename the .par file last; on failure undo the partition renames. */
    if ((error= handler::rename_table(from, to)))
    {
      /* Try to revert everything, ignore errors */
      (void) handler::rename_table(to, from);
      goto rename_error;
    }
  }
  DBUG_RETURN(save_error);
rename_error:
  /* Roll back: rename already-processed partitions back to 'from'. */
  name_buffer_ptr= m_name_buffer_ptr;
  for (abort_file= file, file= m_file; file < abort_file; file++)
  {
    /* Revert the rename, back from 'to' to the original 'from' */
    if (!create_partition_name(from_buff, from_path, name_buffer_ptr,
                               NORMAL_PART_NAME, FALSE))
       if (!create_partition_name(to_buff, to_path, name_buffer_ptr,
                                  NORMAL_PART_NAME, FALSE))
         /* Ignore error here */
         (void) (*file)->ha_rename_table(to_buff, from_buff);
    name_buffer_ptr= strend(name_buffer_ptr) + 1;
  }
  DBUG_RETURN(error);
}
2437 
2438 
2439 /**
2440   Set up table share object before calling create on underlying handler
2441 
2442   @param table             Table object
2443   @param info              Create info
2444   @param part_elem[in,out] Pointer to used partition_element, searched if NULL
2445 
2446   @return    status
2447     @retval  TRUE  Error
2448     @retval  FALSE Success
2449 
2450   @details
2451     Set up
2452     1) Comment on partition
2453     2) MAX_ROWS, MIN_ROWS on partition
2454     3) Index file name on partition
2455     4) Data file name on partition
2456 */
2457 
set_up_table_before_create(TABLE * tbl,const char * partition_name_with_path,HA_CREATE_INFO * info,partition_element * part_elem)2458 int ha_partition::set_up_table_before_create(TABLE *tbl,
2459                     const char *partition_name_with_path,
2460                     HA_CREATE_INFO *info,
2461                     partition_element *part_elem)
2462 {
2463   int error= 0;
2464   const char *partition_name;
2465   THD *thd= ha_thd();
2466   DBUG_ENTER("set_up_table_before_create");
2467 
2468   DBUG_ASSERT(part_elem);
2469 
2470   if (!part_elem)
2471     DBUG_RETURN(1);
2472   tbl->s->max_rows= part_elem->part_max_rows;
2473   tbl->s->min_rows= part_elem->part_min_rows;
2474   partition_name= strrchr(partition_name_with_path, FN_LIBCHAR);
2475   if ((part_elem->index_file_name &&
2476       (error= append_file_to_dir(thd,
2477                                  (const char**)&part_elem->index_file_name,
2478                                  partition_name+1))) ||
2479       (part_elem->data_file_name &&
2480       (error= append_file_to_dir(thd,
2481                                  (const char**)&part_elem->data_file_name,
2482                                  partition_name+1))))
2483   {
2484     DBUG_RETURN(error);
2485   }
2486   info->index_file_name= part_elem->index_file_name;
2487   info->data_file_name= part_elem->data_file_name;
2488   DBUG_RETURN(0);
2489 }
2490 
2491 
2492 /*
2493   Add two names together
2494 
2495   SYNOPSIS
2496     name_add()
2497     out:dest                          Destination string
2498     first_name                        First name
2499     sec_name                          Second name
2500 
2501   RETURN VALUE
2502     >0                                Error
2503     0                                 Success
2504 
2505   DESCRIPTION
    Routine used to add two names with the "#SP#" delimiter in between them.
    Service routine to create_handler_file
2508     Include the NULL in the count of characters since it is needed as separator
2509     between the partition names.
2510 */
2511 
name_add(char * dest,const char * first_name,const char * sec_name)2512 static uint name_add(char *dest, const char *first_name, const char *sec_name)
2513 {
2514   return (uint) (strxmov(dest, first_name, "#SP#", sec_name, NullS) -dest) + 1;
2515 }
2516 
2517 
2518 /**
2519   Create the special .par file
2520 
2521   @param name  Full path of table name
2522 
2523   @return Operation status
    @retval TRUE   Error
    @retval FALSE  Success
2526 
2527   @note
2528     Method used to create handler file with names of partitions, their
2529     engine types and the number of partitions.
2530 */
2531 
bool ha_partition::create_handler_file(const char *name)
{
  partition_element *part_elem, *subpart_elem;
  uint i, j, part_name_len, subpart_name_len;
  uint tot_partition_words, tot_name_len, num_parts;
  uint tot_parts= 0;
  uint tot_len_words, tot_len_byte, chksum, tot_name_words;
  char *name_buffer_ptr;
  uchar *file_buffer, *engine_array;
  bool result= TRUE;
  char file_name[FN_REFLEN];
  char part_name[FN_REFLEN];
  char subpart_name[FN_REFLEN];
  File file;
  List_iterator_fast <partition_element> part_it(m_part_info->partitions);
  DBUG_ENTER("create_handler_file");

  num_parts= m_part_info->partitions.elements;
  DBUG_PRINT("info", ("table name = %s, num_parts = %u", name,
                      num_parts));
  tot_name_len= 0;
  /*
    Pass 1: size the name section and count partitions that will be
    written.  Partitions in other states (e.g. to be dropped) are skipped.
  */
  for (i= 0; i < num_parts; i++)
  {
    part_elem= part_it++;
    if (part_elem->part_state != PART_NORMAL &&
        part_elem->part_state != PART_TO_BE_ADDED &&
        part_elem->part_state != PART_CHANGED)
      continue;
    tablename_to_filename(part_elem->partition_name, part_name,
                          FN_REFLEN);
    part_name_len= strlen(part_name);
    if (!m_is_sub_partitioned)
    {
      tot_name_len+= part_name_len + 1;
      tot_parts++;
    }
    else
    {
      List_iterator_fast <partition_element> sub_it(part_elem->subpartitions);
      for (j= 0; j < m_part_info->num_subparts; j++)
      {
        subpart_elem= sub_it++;
        tablename_to_filename(subpart_elem->partition_name,
                              subpart_name,
                              FN_REFLEN);
        subpart_name_len= strlen(subpart_name);
        /* +5 = "#SP#" separator (4) plus the terminating NUL (1). */
        tot_name_len+= part_name_len + subpart_name_len + 5;
        tot_parts++;
      }
    }
  }
  /*
     File format:
     Length in words              4 byte
     Checksum                     4 byte
     Total number of partitions   4 byte
     Array of engine types        n * 4 bytes where
     n = (m_tot_parts + 3)/4
     Length of name part in bytes 4 bytes
     (Names in filename format)
     Name part                    m * 4 bytes where
     m = ((length_name_part + 3)/4)*4

     All padding bytes are zeroed
  */
  tot_partition_words= (tot_parts + PAR_WORD_SIZE - 1) / PAR_WORD_SIZE;
  tot_name_words= (tot_name_len + PAR_WORD_SIZE - 1) / PAR_WORD_SIZE;
  /* 4 static words (tot words, checksum, tot partitions, name length) */
  tot_len_words= 4 + tot_partition_words + tot_name_words;
  tot_len_byte= PAR_WORD_SIZE * tot_len_words;
  if (!(file_buffer= (uchar *) my_malloc(tot_len_byte, MYF(MY_ZEROFILL))))
    DBUG_RETURN(TRUE);
  engine_array= (file_buffer + PAR_ENGINES_OFFSET);
  name_buffer_ptr= (char*) (engine_array + tot_partition_words * PAR_WORD_SIZE
                            + PAR_WORD_SIZE);
  part_it.rewind();
  /*
    Pass 2: fill in one engine byte and one NUL-terminated name per
    (sub)partition, in the same order and with the same state filter
    as pass 1.
  */
  for (i= 0; i < num_parts; i++)
  {
    part_elem= part_it++;
    if (part_elem->part_state != PART_NORMAL &&
        part_elem->part_state != PART_TO_BE_ADDED &&
        part_elem->part_state != PART_CHANGED)
      continue;
    if (!m_is_sub_partitioned)
    {
      tablename_to_filename(part_elem->partition_name, part_name, FN_REFLEN);
      name_buffer_ptr= strmov(name_buffer_ptr, part_name)+1;
      *engine_array= (uchar) ha_legacy_type(part_elem->engine_type);
      DBUG_PRINT("info", ("engine: %u", *engine_array));
      engine_array++;
    }
    else
    {
      List_iterator_fast <partition_element> sub_it(part_elem->subpartitions);
      for (j= 0; j < m_part_info->num_subparts; j++)
      {
        subpart_elem= sub_it++;
        tablename_to_filename(part_elem->partition_name, part_name,
                              FN_REFLEN);
        tablename_to_filename(subpart_elem->partition_name, subpart_name,
                              FN_REFLEN);
        name_buffer_ptr+= name_add(name_buffer_ptr,
                                   part_name,
                                   subpart_name);
        *engine_array= (uchar) ha_legacy_type(subpart_elem->engine_type);
        DBUG_PRINT("info", ("engine: %u", *engine_array));
        engine_array++;
      }
    }
  }
  chksum= 0;
  int4store(file_buffer, tot_len_words);
  int4store(file_buffer + PAR_NUM_PARTS_OFFSET, tot_parts);
  int4store(file_buffer + PAR_ENGINES_OFFSET +
            (tot_partition_words * PAR_WORD_SIZE),
            tot_name_len);
  /* XOR of all words; checked by the reader of the .par file. */
  for (i= 0; i < tot_len_words; i++)
    chksum^= uint4korr(file_buffer + PAR_WORD_SIZE * i);
  int4store(file_buffer + PAR_CHECKSUM_OFFSET, chksum);
  /*
    Add .par extension to the file name.
    Create and write and close file
    to be used at open, delete_table and rename_table
  */
  fn_format(file_name, name, "", ha_par_ext, MY_APPEND_EXT);
  if ((file= mysql_file_create(key_file_partition,
                               file_name, CREATE_MODE, O_RDWR | O_TRUNC,
                               MYF(MY_WME))) >= 0)
  {
    result= mysql_file_write(file, (uchar *) file_buffer, tot_len_byte,
                             MYF(MY_WME | MY_NABP)) != 0;
    (void) mysql_file_close(file, MYF(0));
  }
  else
    result= TRUE;
  my_free(file_buffer);
  DBUG_RETURN(result);
}
2670 
2671 
2672 /**
2673   Clear handler variables and free some memory
2674 */
2675 
clear_handler_file()2676 void ha_partition::clear_handler_file()
2677 {
2678   if (m_engine_array)
2679   {
2680     plugin_unlock_list(NULL, m_engine_array, m_tot_parts);
2681     my_free(m_engine_array);
2682     m_engine_array= NULL;
2683   }
2684   if (m_file_buffer)
2685   {
2686     my_free(m_file_buffer);
2687     m_file_buffer= NULL;
2688   }
2689 }
2690 
2691 
2692 /**
2693   Create underlying handler objects
2694 
2695   @param mem_root  Allocate memory through this
2696 
2697   @return Operation status
2698     @retval TRUE   Error
2699     @retval FALSE  Success
2700 */
2701 
create_handlers(MEM_ROOT * mem_root)2702 bool ha_partition::create_handlers(MEM_ROOT *mem_root)
2703 {
2704   uint i;
2705   uint alloc_len= (m_tot_parts + 1) * sizeof(handler*);
2706   handlerton *hton0;
2707   DBUG_ENTER("create_handlers");
2708 
2709   if (!(m_file= (handler **) alloc_root(mem_root, alloc_len)))
2710     DBUG_RETURN(TRUE);
2711   m_file_tot_parts= m_tot_parts;
2712   memset(m_file, 0, alloc_len);
2713   for (i= 0; i < m_tot_parts; i++)
2714   {
2715     handlerton *hton= plugin_data(m_engine_array[i], handlerton*);
2716     if (!(m_file[i]= get_new_handler(table_share, mem_root, hton)))
2717       DBUG_RETURN(TRUE);
2718     DBUG_PRINT("info", ("engine_type: %u", hton->db_type));
2719   }
2720   /* For the moment we only support partition over the same table engine */
2721   hton0= plugin_data(m_engine_array[0], handlerton*);
2722   if (hton0 == myisam_hton)
2723   {
2724     DBUG_PRINT("info", ("MyISAM"));
2725     m_myisam= TRUE;
2726   }
2727   /* INNODB may not be compiled in... */
2728   else if (ha_legacy_type(hton0) == DB_TYPE_INNODB)
2729   {
2730     DBUG_PRINT("info", ("InnoDB"));
2731     m_innodb= TRUE;
2732   }
2733   DBUG_RETURN(FALSE);
2734 }
2735 
2736 
2737 /*
2738   Create underlying handler objects from partition info
2739 
2740   SYNOPSIS
2741     new_handlers_from_part_info()
2742     mem_root		Allocate memory through this
2743 
2744   RETURN VALUE
2745     TRUE                  Error
2746     FALSE                 Success
2747 */
2748 
new_handlers_from_part_info(MEM_ROOT * mem_root)2749 bool ha_partition::new_handlers_from_part_info(MEM_ROOT *mem_root)
2750 {
2751   uint i, j, part_count;
2752   partition_element *part_elem;
2753   uint alloc_len= (m_tot_parts + 1) * sizeof(handler*);
2754   List_iterator_fast <partition_element> part_it(m_part_info->partitions);
2755   DBUG_ENTER("ha_partition::new_handlers_from_part_info");
2756 
2757   if (!(m_file= (handler **) alloc_root(mem_root, alloc_len)))
2758   {
2759     mem_alloc_error(alloc_len);
2760     goto error_end;
2761   }
2762   m_file_tot_parts= m_tot_parts;
2763   memset(m_file, 0, alloc_len);
2764   DBUG_ASSERT(m_part_info->num_parts > 0);
2765 
2766   i= 0;
2767   part_count= 0;
2768   /*
2769     Don't know the size of the underlying storage engine, invent a number of
2770     bytes allocated for error message if allocation fails
2771   */
2772   do
2773   {
2774     part_elem= part_it++;
2775     if (m_is_sub_partitioned)
2776     {
2777       for (j= 0; j < m_part_info->num_subparts; j++)
2778       {
2779 	if (!(m_file[part_count++]= get_new_handler(table_share, mem_root,
2780                                                     part_elem->engine_type)))
2781           goto error;
2782 	DBUG_PRINT("info", ("engine_type: %u",
2783                    (uint) ha_legacy_type(part_elem->engine_type)));
2784       }
2785     }
2786     else
2787     {
2788       if (!(m_file[part_count++]= get_new_handler(table_share, mem_root,
2789                                                   part_elem->engine_type)))
2790         goto error;
2791       DBUG_PRINT("info", ("engine_type: %u",
2792                  (uint) ha_legacy_type(part_elem->engine_type)));
2793     }
2794   } while (++i < m_part_info->num_parts);
2795   if (part_elem->engine_type == myisam_hton)
2796   {
2797     DBUG_PRINT("info", ("MyISAM"));
2798     m_myisam= TRUE;
2799   }
2800   DBUG_RETURN(FALSE);
2801 error:
2802   mem_alloc_error(sizeof(handler));
2803 error_end:
2804   DBUG_RETURN(TRUE);
2805 }
2806 
2807 
2808 /**
2809   Read the .par file to get the partitions engines and names
2810 
2811   @param name  Name of table file (without extention)
2812 
2813   @return Operation status
2814     @retval true   Failure
2815     @retval false  Success
2816 
2817   @note On success, m_file_buffer is allocated and must be
2818   freed by the caller. m_name_buffer_ptr and m_tot_parts is also set.
2819 */
2820 
read_par_file(const char * name)2821 bool ha_partition::read_par_file(const char *name)
2822 {
2823   char buff[FN_REFLEN], *tot_name_len_offset, *buff_p= buff;
2824   File file;
2825   char *file_buffer;
2826   uint i, len_bytes, len_words, tot_partition_words, tot_name_words, chksum;
2827   DBUG_ENTER("ha_partition::read_par_file");
2828   DBUG_PRINT("enter", ("table name: '%s'", name));
2829 
2830   if (m_file_buffer)
2831     DBUG_RETURN(false);
2832   fn_format(buff, name, "", ha_par_ext, MY_APPEND_EXT);
2833 
2834   /* Following could be done with mysql_file_stat to read in whole file */
2835   if ((file= mysql_file_open(key_file_partition,
2836                              buff, O_RDONLY | O_SHARE, MYF(0))) < 0)
2837     DBUG_RETURN(TRUE);
2838   if (mysql_file_read(file, (uchar *) &buff[0], PAR_WORD_SIZE, MYF(MY_NABP)))
2839     goto err1;
2840   len_words= uint4korr(buff_p);
2841   len_bytes= PAR_WORD_SIZE * len_words;
2842   if (mysql_file_seek(file, 0, MY_SEEK_SET, MYF(0)) == MY_FILEPOS_ERROR)
2843     goto err1;
2844   if (!(file_buffer= (char*) my_malloc(len_bytes, MYF(0))))
2845     goto err1;
2846   if (mysql_file_read(file, (uchar *) file_buffer, len_bytes, MYF(MY_NABP)))
2847     goto err2;
2848 
2849   chksum= 0;
2850   for (i= 0; i < len_words; i++)
2851     chksum ^= uint4korr((file_buffer) + PAR_WORD_SIZE * i);
2852   if (chksum)
2853     goto err2;
2854   m_tot_parts= uint4korr((file_buffer) + PAR_NUM_PARTS_OFFSET);
2855   DBUG_PRINT("info", ("No of parts = %u", m_tot_parts));
2856   tot_partition_words= (m_tot_parts + PAR_WORD_SIZE - 1) / PAR_WORD_SIZE;
2857 
2858   tot_name_len_offset= file_buffer + PAR_ENGINES_OFFSET +
2859                        PAR_WORD_SIZE * tot_partition_words;
2860   tot_name_words= (uint4korr(tot_name_len_offset) + PAR_WORD_SIZE - 1) /
2861                   PAR_WORD_SIZE;
2862   /*
2863     Verify the total length = tot size word, checksum word, num parts word +
2864     engines array + name length word + name array.
2865   */
2866   if (len_words != (tot_partition_words + tot_name_words + 4))
2867     goto err2;
2868   (void) mysql_file_close(file, MYF(0));
2869   m_file_buffer= file_buffer;          // Will be freed in clear_handler_file()
2870   m_name_buffer_ptr= tot_name_len_offset + PAR_WORD_SIZE;
2871 
2872   DBUG_RETURN(false);
2873 
2874 err2:
2875   my_free(file_buffer);
2876 err1:
2877   (void) mysql_file_close(file, MYF(0));
2878   DBUG_RETURN(true);
2879 }
2880 
2881 
2882 /**
2883   Setup m_engine_array
2884 
2885   @param mem_root  MEM_ROOT to use for allocating new handlers
2886 
2887   @return Operation status
2888     @retval false  Success
2889     @retval true   Failure
2890 */
2891 
setup_engine_array(MEM_ROOT * mem_root)2892 bool ha_partition::setup_engine_array(MEM_ROOT *mem_root)
2893 {
2894   uint i;
2895   uchar *buff;
2896   handlerton **engine_array, *first_engine;
2897   enum legacy_db_type db_type, first_db_type;
2898 
2899   DBUG_ASSERT(!m_file);
2900   DBUG_ENTER("ha_partition::setup_engine_array");
2901   engine_array= (handlerton **) my_alloca(m_tot_parts * sizeof(handlerton*));
2902   if (!engine_array)
2903     DBUG_RETURN(true);
2904 
2905   buff= (uchar *) (m_file_buffer + PAR_ENGINES_OFFSET);
2906   first_db_type= (enum legacy_db_type) buff[0];
2907   first_engine= ha_resolve_by_legacy_type(ha_thd(), first_db_type);
2908   if (!first_engine)
2909     goto err;
2910 
2911   if (!(m_engine_array= (plugin_ref*)
2912                 my_malloc(m_tot_parts * sizeof(plugin_ref), MYF(MY_WME))))
2913     goto err;
2914 
2915   for (i= 0; i < m_tot_parts; i++)
2916   {
2917     db_type= (enum legacy_db_type) buff[i];
2918     if (db_type != first_db_type)
2919     {
2920       DBUG_PRINT("error", ("partition %u engine %d is not same as "
2921                            "first partition %d", i, db_type,
2922                            (int) first_db_type));
2923       DBUG_ASSERT(0);
2924       clear_handler_file();
2925       goto err;
2926     }
2927     m_engine_array[i]= ha_lock_engine(NULL, first_engine);
2928     if (!m_engine_array[i])
2929     {
2930       clear_handler_file();
2931       goto err;
2932     }
2933   }
2934 
2935   my_afree((gptr) engine_array);
2936 
2937   if (create_handlers(mem_root))
2938   {
2939     clear_handler_file();
2940     DBUG_RETURN(true);
2941   }
2942 
2943   DBUG_RETURN(false);
2944 
2945 err:
2946   my_afree((gptr) engine_array);
2947   DBUG_RETURN(true);
2948 }
2949 
2950 
2951 /**
2952   Get info about partition engines and their names from the .par file
2953 
2954   @param name      Full path of table name
2955   @param mem_root  Allocate memory through this
2956   @param is_clone  If it is a clone, don't create new handlers
2957 
2958   @return Operation status
2959     @retval true   Error
2960     @retval false  Success
2961 
2962   @note Open handler file to get partition names, engine types and number of
2963   partitions.
2964 */
2965 
get_from_handler_file(const char * name,MEM_ROOT * mem_root,bool is_clone)2966 bool ha_partition::get_from_handler_file(const char *name, MEM_ROOT *mem_root,
2967                                          bool is_clone)
2968 {
2969   DBUG_ENTER("ha_partition::get_from_handler_file");
2970   DBUG_PRINT("enter", ("table name: '%s'", name));
2971 
2972   if (m_file_buffer)
2973     DBUG_RETURN(false);
2974 
2975   if (read_par_file(name))
2976     DBUG_RETURN(true);
2977 
2978   if (!is_clone && setup_engine_array(mem_root))
2979     DBUG_RETURN(true);
2980 
2981   DBUG_RETURN(false);
2982 }
2983 
2984 
2985 /****************************************************************************
2986                 MODULE open/close object
2987 ****************************************************************************/
2988 
2989 /**
2990   Get the partition name.
2991 
2992   @param       part   Struct containing name and length
2993   @param[out]  length Length of the name
2994 
2995   @return Partition name
2996 */
2997 
get_part_name(PART_NAME_DEF * part,size_t * length,my_bool not_used MY_ATTRIBUTE ((unused)))2998 static uchar *get_part_name(PART_NAME_DEF *part, size_t *length,
2999                             my_bool not_used MY_ATTRIBUTE((unused)))
3000 {
3001   *length= part->length;
3002   return part->partition_name;
3003 }
3004 
3005 
3006 /**
3007   Insert a partition name in the partition_name_hash.
3008 
3009   @param name        Name of partition
3010   @param part_id     Partition id (number)
3011   @param is_subpart  Set if the name belongs to a subpartition
3012 
3013   @return Operation status
3014     @retval true   Failure
3015     @retval false  Sucess
3016 */
3017 
insert_partition_name_in_hash(const char * name,uint part_id,bool is_subpart)3018 bool ha_partition::insert_partition_name_in_hash(const char *name, uint part_id,
3019                                                  bool is_subpart)
3020 {
3021   PART_NAME_DEF *part_def;
3022   uchar *part_name;
3023   uint part_name_length;
3024   DBUG_ENTER("ha_partition::insert_partition_name_in_hash");
3025   /*
3026     Calculate and store the length here, to avoid doing it when
3027     searching the hash.
3028   */
3029   part_name_length= strlen(name);
3030   /*
3031     Must use memory that lives as long as table_share.
3032     Freed in the Partition_share destructor.
3033     Since we use my_multi_malloc, then my_free(part_def) will also free
3034     part_name, as a part of my_hash_free.
3035   */
3036   if (!my_multi_malloc(MY_WME,
3037                        &part_def, sizeof(PART_NAME_DEF),
3038                        &part_name, part_name_length + 1,
3039                        NULL))
3040     DBUG_RETURN(true);
3041   memcpy(part_name, name, part_name_length + 1);
3042   part_def->partition_name= part_name;
3043   part_def->length= part_name_length;
3044   part_def->part_id= part_id;
3045   part_def->is_subpart= is_subpart;
3046   if (my_hash_insert(&part_share->partition_name_hash, (uchar *) part_def))
3047   {
3048     my_free(part_def);
3049     DBUG_RETURN(true);
3050   }
3051   DBUG_RETURN(false);
3052 }
3053 
3054 
/**
  Populate the partition_name_hash in part_share.

  Maps each partition (and subpartition) name to its partition id so
  name-based lookups avoid scanning partition_info. Uses double-checked
  locking: the hash is built at most once per shared table instance.

  @return Operation status
    @retval TRUE   Failure (out of memory or duplicate name)
    @retval FALSE  Success
*/

bool ha_partition::populate_partition_name_hash()
{
  List_iterator<partition_element> part_it(m_part_info->partitions);
  uint num_parts= m_part_info->num_parts;
  /* With no subpartitions each partition still occupies one id slot. */
  uint num_subparts= m_is_sub_partitioned ? m_part_info->num_subparts : 1;
  uint tot_names;
  uint i= 0;
  DBUG_ASSERT(part_share);

  DBUG_ENTER("ha_partition::populate_partition_name_hash");

  /*
    partition_name_hash is only set once and never changed
    -> OK to check without locking.
  */

  if (part_share->partition_name_hash_initialized)
    DBUG_RETURN(false);
  lock_shared_ha_data();
  /* Re-check under the lock: another thread may have built it meanwhile. */
  if (part_share->partition_name_hash_initialized)
  {
    unlock_shared_ha_data();
    DBUG_RETURN(false);
  }
  /* Subpartitioned tables store both partition and subpartition names. */
  tot_names= m_is_sub_partitioned ? m_tot_parts + num_parts : num_parts;
  if (my_hash_init(&part_share->partition_name_hash,
                   system_charset_info, tot_names, 0, 0,
                   (my_hash_get_key) get_part_name,
                   my_free, HASH_UNIQUE))
  {
    unlock_shared_ha_data();
    DBUG_RETURN(TRUE);
  }

  do
  {
    partition_element *part_elem= part_it++;
    /* Only fully existing partitions are expected at this point. */
    DBUG_ASSERT(part_elem->part_state == PART_NORMAL);
    if (part_elem->part_state == PART_NORMAL)
    {
      /* Partition id of partition i is i * num_subparts. */
      if (insert_partition_name_in_hash(part_elem->partition_name,
                                        i * num_subparts, false))
        goto err;
      if (m_is_sub_partitioned)
      {
        List_iterator<partition_element>
                                    subpart_it(part_elem->subpartitions);
        partition_element *sub_elem;
        uint j= 0;
        do
        {
          sub_elem= subpart_it++;
          /* Subpartition j of partition i has id i * num_subparts + j. */
          if (insert_partition_name_in_hash(sub_elem->partition_name,
                                            i * num_subparts + j, true))
            goto err;

        } while (++j < num_subparts);
      }
    }
  } while (++i < num_parts);

  part_share->partition_name_hash_initialized= true;
  unlock_shared_ha_data();

  DBUG_RETURN(FALSE);
err:
  /* my_hash_free also frees every entry inserted so far (free_element). */
  my_hash_free(&part_share->partition_name_hash);
  unlock_shared_ha_data();

  DBUG_RETURN(TRUE);
}
3130 
3131 
3132 /**
3133   Set Handler_share pointer and allocate Handler_share pointers
3134   for each partition and set those.
3135 
3136   @param ha_share_arg  Where to store/retrieve the Partitioning_share pointer
3137                        to be shared by all instances of the same table.
3138 
3139   @return Operation status
3140     @retval true  Failure
3141     @retval false Sucess
3142 */
3143 
set_ha_share_ref(Handler_share ** ha_share_arg)3144 bool ha_partition::set_ha_share_ref(Handler_share **ha_share_arg)
3145 {
3146   Handler_share **ha_shares;
3147   uint i;
3148   DBUG_ENTER("ha_partition::set_ha_share_ref");
3149 
3150   DBUG_ASSERT(!part_share);
3151   DBUG_ASSERT(table_share);
3152   DBUG_ASSERT(!m_is_clone_of);
3153   DBUG_ASSERT(m_tot_parts);
3154   if (handler::set_ha_share_ref(ha_share_arg))
3155     DBUG_RETURN(true);
3156   if (!(part_share= get_share()))
3157     DBUG_RETURN(true);
3158   DBUG_ASSERT(part_share->partitions_share_refs);
3159   DBUG_ASSERT(part_share->partitions_share_refs->num_parts >= m_tot_parts);
3160   ha_shares= part_share->partitions_share_refs->ha_shares;
3161   for (i= 0; i < m_tot_parts; i++)
3162   {
3163     if (m_file[i]->set_ha_share_ref(&ha_shares[i]))
3164       DBUG_RETURN(true);
3165   }
3166   DBUG_RETURN(false);
3167 }
3168 
3169 
3170 /**
3171   Get the PARTITION_SHARE for the table.
3172 
3173   @return Operation status
3174     @retval true   Error
3175     @retval false  Success
3176 
3177   @note Gets or initializes the Partition_share object used by partitioning.
3178   The Partition_share is used for handling the auto_increment etc.
3179 */
3180 
get_share()3181 Partition_share *ha_partition::get_share()
3182 {
3183   Partition_share *tmp_share;
3184   DBUG_ENTER("ha_partition::get_share");
3185   DBUG_ASSERT(table_share);
3186 
3187   lock_shared_ha_data();
3188   if (!(tmp_share= static_cast<Partition_share*>(get_ha_share_ptr())))
3189   {
3190     tmp_share= new Partition_share;
3191     if (!tmp_share)
3192       goto err;
3193     if (tmp_share->init(m_tot_parts))
3194     {
3195       delete tmp_share;
3196       tmp_share= NULL;
3197       goto err;
3198     }
3199     set_ha_share_ptr(static_cast<Handler_share*>(tmp_share));
3200   }
3201 err:
3202   unlock_shared_ha_data();
3203   DBUG_RETURN(tmp_share);
3204 }
3205 
3206 
3207 
3208 /**
3209   Helper function for freeing all internal bitmaps.
3210 */
3211 
free_partition_bitmaps()3212 void ha_partition::free_partition_bitmaps()
3213 {
3214   /* Initialize the bitmap we use to minimize ha_start_bulk_insert calls */
3215   bitmap_free(&m_bulk_insert_started);
3216   bitmap_free(&m_locked_partitions);
3217   bitmap_free(&m_partitions_to_reset);
3218   bitmap_free(&m_key_not_found_partitions);
3219 }
3220 
3221 
3222 /**
3223   Helper function for initializing all internal bitmaps.
3224 */
3225 
init_partition_bitmaps()3226 bool ha_partition::init_partition_bitmaps()
3227 {
3228   DBUG_ENTER("ha_partition::init_partition_bitmaps");
3229   /* Initialize the bitmap we use to minimize ha_start_bulk_insert calls */
3230   if (bitmap_init(&m_bulk_insert_started, NULL, m_tot_parts + 1, FALSE))
3231     DBUG_RETURN(true);
3232   bitmap_clear_all(&m_bulk_insert_started);
3233 
3234   /* Initialize the bitmap we use to keep track of locked partitions */
3235   if (bitmap_init(&m_locked_partitions, NULL, m_tot_parts, FALSE))
3236   {
3237     bitmap_free(&m_bulk_insert_started);
3238     DBUG_RETURN(true);
3239   }
3240   bitmap_clear_all(&m_locked_partitions);
3241 
3242   /*
3243     Initialize the bitmap we use to keep track of partitions which may have
3244     something to reset in ha_reset().
3245   */
3246   if (bitmap_init(&m_partitions_to_reset, NULL, m_tot_parts, FALSE))
3247   {
3248     bitmap_free(&m_bulk_insert_started);
3249     bitmap_free(&m_locked_partitions);
3250     DBUG_RETURN(true);
3251   }
3252   bitmap_clear_all(&m_partitions_to_reset);
3253 
3254   /*
3255     Initialize the bitmap we use to keep track of partitions which returned
3256     HA_ERR_KEY_NOT_FOUND from index_read_map.
3257   */
3258   if (bitmap_init(&m_key_not_found_partitions, NULL, m_tot_parts, FALSE))
3259   {
3260     bitmap_free(&m_bulk_insert_started);
3261     bitmap_free(&m_locked_partitions);
3262     bitmap_free(&m_partitions_to_reset);
3263     DBUG_RETURN(true);
3264   }
3265   bitmap_clear_all(&m_key_not_found_partitions);
3266   m_key_not_found= false;
3267   /* Initialize the bitmap for read/lock_partitions */
3268   if (!m_is_clone_of)
3269   {
3270     DBUG_ASSERT(!m_clone_mem_root);
3271     if (m_part_info->set_partition_bitmaps(NULL))
3272     {
3273       free_partition_bitmaps();
3274       DBUG_RETURN(true);
3275     }
3276   }
3277   DBUG_RETURN(false);
3278 }
3279 
3280 
3281 /*
3282   Open handler object
3283 
3284   SYNOPSIS
3285     open()
3286     name                  Full path of table name
3287     mode                  Open mode flags
3288     test_if_locked        ?
3289 
3290   RETURN VALUE
3291     >0                    Error
3292     0                     Success
3293 
3294   DESCRIPTION
3295     Used for opening tables. The name will be the name of the file.
3296     A table is opened when it needs to be opened. For instance
3297     when a request comes in for a select on the table (tables are not
3298     open and closed for each request, they are cached).
3299 
3300     Called from handler.cc by handler::ha_open(). The server opens all tables
3301     by calling ha_open() which then calls the handler specific open().
3302 */
3303 
open(const char * name,int mode,uint test_if_locked)3304 int ha_partition::open(const char *name, int mode, uint test_if_locked)
3305 {
3306   char *name_buffer_ptr;
3307   int error= HA_ERR_INITIALIZATION;
3308   handler **file;
3309   char name_buff[FN_REFLEN + 1];
3310   ulonglong check_table_flags;
3311   DBUG_ENTER("ha_partition::open");
3312 
3313   DBUG_ASSERT(table->s == table_share);
3314   ref_length= 0;
3315   m_mode= mode;
3316   m_open_test_lock= test_if_locked;
3317   m_part_field_array= m_part_info->full_part_field_array;
3318   if (get_from_handler_file(name, &table->mem_root, MY_TEST(m_is_clone_of)))
3319     DBUG_RETURN(error);
3320   name_buffer_ptr= m_name_buffer_ptr;
3321   if (populate_partition_name_hash())
3322   {
3323     DBUG_RETURN(HA_ERR_INITIALIZATION);
3324   }
3325   m_start_key.length= 0;
3326   m_rec0= table->record[0];
3327   legacy_db_type db_type = ha_legacy_type(m_part_info->default_engine_type);
3328   if(db_type == DB_TYPE_HEAP)
3329   {
3330    m_rec_length= table_share->rec_buff_length;
3331   }
3332   else {
3333    m_rec_length= table_share->rec_buff_length;
3334   }
3335   DBUG_ASSERT(db_type !=  DB_TYPE_UNKNOWN);
3336 
3337   if (!m_part_ids_sorted_by_num_of_records)
3338   {
3339     if (!(m_part_ids_sorted_by_num_of_records=
3340             (uint32*) my_malloc(m_tot_parts * sizeof(uint32), MYF(MY_WME))))
3341       DBUG_RETURN(error);
3342     uint32 i;
3343     /* Initialize it with all partition ids. */
3344     for (i= 0; i < m_tot_parts; i++)
3345       m_part_ids_sorted_by_num_of_records[i]= i;
3346   }
3347 
3348   if (init_partition_bitmaps())
3349     DBUG_RETURN(error);
3350 
3351   DBUG_ASSERT(m_part_info);
3352 
3353   if (m_is_clone_of)
3354   {
3355     uint i, alloc_len;
3356     DBUG_ASSERT(m_clone_mem_root);
3357     /* Allocate an array of handler pointers for the partitions handlers. */
3358     alloc_len= (m_tot_parts + 1) * sizeof(handler*);
3359     if (!(m_file= (handler **) alloc_root(m_clone_mem_root, alloc_len)))
3360     {
3361       error= HA_ERR_INITIALIZATION;
3362       goto err_alloc;
3363     }
3364     memset(m_file, 0, alloc_len);
3365     /*
3366       Populate them by cloning the original partitions. This also opens them.
3367       Note that file->ref is allocated too.
3368     */
3369     file= m_is_clone_of->m_file;
3370     for (i= 0; i < m_tot_parts; i++)
3371     {
3372       if ((error= create_partition_name(name_buff, name, name_buffer_ptr,
3373                                         NORMAL_PART_NAME, FALSE)))
3374       {
3375         file= &m_file[i];
3376         goto err_handler;
3377       }
3378 
3379       /* ::clone() will also set ha_share from the original. */
3380       if (!(m_file[i]= file[i]->clone(name_buff, m_clone_mem_root)))
3381       {
3382         error= HA_ERR_INITIALIZATION;
3383         file= &m_file[i];
3384         goto err_handler;
3385       }
3386       name_buffer_ptr+= strlen(name_buffer_ptr) + 1;
3387     }
3388   }
3389   else
3390   {
3391    file= m_file;
3392    do
3393    {
3394       if ((error= create_partition_name(name_buff, name, name_buffer_ptr,
3395                                         NORMAL_PART_NAME, FALSE)))
3396         goto err_handler;
3397 
3398       if ((error= (*file)->ha_open(table, name_buff, mode,
3399                                    test_if_locked | HA_OPEN_NO_PSI_CALL)))
3400         goto err_handler;
3401       if (m_file == file)
3402         m_num_locks= (*file)->lock_count();
3403       DBUG_ASSERT(m_num_locks == (*file)->lock_count());
3404       name_buffer_ptr+= strlen(name_buffer_ptr) + 1;
3405     } while (*(++file));
3406   }
3407 
3408   file= m_file;
3409   ref_length= (*file)->ref_length;
3410   check_table_flags= (((*file)->ha_table_flags() &
3411                        ~(PARTITION_DISABLED_TABLE_FLAGS)) |
3412                       (PARTITION_ENABLED_TABLE_FLAGS));
3413   while (*(++file))
3414   {
3415     /* MyISAM can have smaller ref_length for partitions with MAX_ROWS set */
3416     set_if_bigger(ref_length, ((*file)->ref_length));
3417     /*
3418       Verify that all partitions have the same set of table flags.
3419       Mask all flags that partitioning enables/disables.
3420     */
3421     if (check_table_flags != (((*file)->ha_table_flags() &
3422                                ~(PARTITION_DISABLED_TABLE_FLAGS)) |
3423                               (PARTITION_ENABLED_TABLE_FLAGS)))
3424     {
3425       error= HA_ERR_INITIALIZATION;
3426       /* set file to last handler, so all of them are closed */
3427       file = &m_file[m_tot_parts - 1];
3428       goto err_handler;
3429     }
3430   }
3431   key_used_on_scan= m_file[0]->key_used_on_scan;
3432   implicit_emptied= m_file[0]->implicit_emptied;
3433   /*
3434     Add 2 bytes for partition id in position ref length.
3435     ref_length=max_in_all_partitions(ref_length) + PARTITION_BYTES_IN_POS
3436   */
3437   ref_length+= PARTITION_BYTES_IN_POS;
3438   m_ref_length= ref_length;
3439 
3440   /*
3441     Release buffer read from .par file. It will not be reused again after
3442     being opened once.
3443   */
3444   clear_handler_file();
3445 
3446   /*
3447     Some handlers update statistics as part of the open call. This will in
3448     some cases corrupt the statistics of the partition handler and thus
3449     to ensure we have correct statistics we call info from open after
3450     calling open on all individual handlers.
3451   */
3452   m_handler_status= handler_opened;
3453   if (m_part_info->part_expr)
3454     m_part_func_monotonicity_info=
3455                             m_part_info->part_expr->get_monotonicity_info();
3456   else if (m_part_info->list_of_part_fields)
3457     m_part_func_monotonicity_info= MONOTONIC_STRICT_INCREASING;
3458   info(HA_STATUS_VARIABLE | HA_STATUS_CONST);
3459   DBUG_RETURN(0);
3460 
3461 err_handler:
3462   DEBUG_SYNC(ha_thd(), "partition_open_error");
3463   while (file-- != m_file)
3464     (*file)->ha_close();
3465 err_alloc:
3466   free_partition_bitmaps();
3467 
3468   DBUG_RETURN(error);
3469 }
3470 
3471 
3472 /*
3473   Disabled since it is not possible to prune yet.
3474   without pruning, it need to rebind/unbind every partition in every
3475   statement which uses a table from the table cache. Will also use
3476   as many PSI_tables as there are partitions.
3477 */
3478 #ifdef HAVE_M_PSI_PER_PARTITION
unbind_psi()3479 void ha_partition::unbind_psi()
3480 {
3481   uint i;
3482 
3483   DBUG_ENTER("ha_partition::unbind_psi");
3484   handler::unbind_psi();
3485   for (i= 0; i < m_tot_parts; i++)
3486   {
3487     DBUG_ASSERT(m_file[i] != NULL);
3488     m_file[i]->unbind_psi();
3489   }
3490   DBUG_VOID_RETURN;
3491 }
3492 
rebind_psi()3493 void ha_partition::rebind_psi()
3494 {
3495   uint i;
3496 
3497   DBUG_ENTER("ha_partition::rebind_psi");
3498   handler::rebind_psi();
3499   for (i= 0; i < m_tot_parts; i++)
3500   {
3501     DBUG_ASSERT(m_file[i] != NULL);
3502     m_file[i]->rebind_psi();
3503   }
3504   DBUG_VOID_RETURN;
3505 }
3506 #endif /* HAVE_M_PSI_PER_PARTITION */
3507 
3508 
3509 /**
3510   Clone the open and locked partitioning handler.
3511 
3512   @param  mem_root  MEM_ROOT to use.
3513 
3514   @return Pointer to the successfully created clone or NULL
3515 
3516   @details
3517   This function creates a new ha_partition handler as a clone/copy. The
3518   original (this) must already be opened and locked. The clone will use
3519   the originals m_part_info.
3520   It also allocates memory for ref + ref_dup.
3521   In ha_partition::open() it will clone its original handlers partitions
3522   which will allocate then on the correct MEM_ROOT and also open them.
3523 */
3524 
clone(const char * name,MEM_ROOT * mem_root)3525 handler *ha_partition::clone(const char *name, MEM_ROOT *mem_root)
3526 {
3527   ha_partition *new_handler;
3528 
3529   DBUG_ENTER("ha_partition::clone");
3530   new_handler= new (mem_root) ha_partition(ht, table_share, m_part_info,
3531                                            this, mem_root);
3532   if (!new_handler)
3533     DBUG_RETURN(NULL);
3534 
3535   /*
3536     We will not clone each partition's handler here, it will be done in
3537     ha_partition::open() for clones. Also set_ha_share_ref is not needed
3538     here, since 1) ha_share is copied in the constructor used above
3539     2) each partition's cloned handler will set it from its original.
3540   */
3541 
3542   /*
3543     Allocate new_handler->ref here because otherwise ha_open will allocate it
3544     on this->table->mem_root and we will not be able to reclaim that memory
3545     when the clone handler object is destroyed.
3546   */
3547   if (!(new_handler->ref= (uchar*) alloc_root(mem_root,
3548                                               ALIGN_SIZE(m_ref_length)*2)))
3549     goto err;
3550 
3551   if (new_handler->ha_open(table, name,
3552                            table->db_stat,
3553                            HA_OPEN_IGNORE_IF_LOCKED | HA_OPEN_NO_PSI_CALL))
3554     goto err;
3555 
3556   DBUG_RETURN((handler*) new_handler);
3557 
3558 err:
3559   delete new_handler;
3560   DBUG_RETURN(NULL);
3561 }
3562 
3563 
3564 /*
3565   Close handler object
3566 
3567   SYNOPSIS
3568     close()
3569 
3570   RETURN VALUE
3571     >0                   Error code
3572     0                    Success
3573 
3574   DESCRIPTION
3575     Called from sql_base.cc, sql_select.cc, and table.cc.
3576     In sql_select.cc it is only used to close up temporary tables or during
3577     the process where a temporary table is converted over to being a
3578     myisam table.
3579     For sql_base.cc look at close_data_tables().
3580 */
3581 
close(void)3582 int ha_partition::close(void)
3583 {
3584   bool first= TRUE;
3585   handler **file;
3586   DBUG_ENTER("ha_partition::close");
3587 
3588   DBUG_ASSERT(table->s == table_share);
3589   destroy_record_priority_queue();
3590   free_partition_bitmaps();
3591   DBUG_ASSERT(m_part_info);
3592   file= m_file;
3593 
3594 repeat:
3595   do
3596   {
3597     (*file)->ha_close();
3598   } while (*(++file));
3599 
3600   if (first && m_added_file && m_added_file[0])
3601   {
3602     file= m_added_file;
3603     first= FALSE;
3604     goto repeat;
3605   }
3606 
3607   m_handler_status= handler_closed;
3608   DBUG_RETURN(0);
3609 }
3610 
3611 /****************************************************************************
3612                 MODULE start/end statement
3613 ****************************************************************************/
3614 /*
3615   A number of methods to define various constants for the handler. In
3616   the case of the partition handler we need to use some max and min
3617   of the underlying handlers in most cases.
3618 */
3619 
3620 /*
3621   Set external locks on table
3622 
3623   SYNOPSIS
3624     external_lock()
3625     thd                    Thread object
3626     lock_type              Type of external lock
3627 
3628   RETURN VALUE
3629     >0                   Error code
3630     0                    Success
3631 
3632   DESCRIPTION
3633     First you should go read the section "locking functions for mysql" in
3634     lock.cc to understand this.
3635     This create a lock on the table. If you are implementing a storage engine
3636     that can handle transactions look at ha_berkeley.cc to see how you will
3637     want to go about doing this. Otherwise you should consider calling
3638     flock() here.
3639     Originally this method was used to set locks on file level to enable
3640     several MySQL Servers to work on the same data. For transactional
3641     engines it has been "abused" to also mean start and end of statements
3642     to enable proper rollback of statements and transactions. When LOCK
3643     TABLES has been issued the start_stmt method takes over the role of
3644     indicating start of statement but in this case there is no end of
3645     statement indicator(?).
3646 
3647     Called from lock.cc by lock_external() and unlock_external(). Also called
3648     from sql_table.cc by copy_data_between_tables().
3649 */
3650 
int ha_partition::external_lock(THD *thd, int lock_type)
{
  uint error;
  uint i, first_used_partition;
  MY_BITMAP *used_partitions;
  DBUG_ENTER("ha_partition::external_lock");

  /* No auto-increment lock may be held while (un)locking partitions. */
  DBUG_ASSERT(!auto_increment_lock && !auto_increment_safe_stmt_log_lock);

  /*
    On unlock, visit exactly the partitions we locked earlier
    (m_locked_partitions); on lock, visit the (possibly pruned) set of
    partitions the statement needs (m_part_info->lock_partitions).
  */
  if (lock_type == F_UNLCK)
    used_partitions= &m_locked_partitions;
  else
    used_partitions= &(m_part_info->lock_partitions);

  first_used_partition= bitmap_get_first_set(used_partitions);

  for (i= first_used_partition;
       i < m_tot_parts;
       i= bitmap_get_next_set(used_partitions, i))
  {
    DBUG_PRINT("info", ("external_lock(thd, %d) part %d", lock_type, i));
    if ((error= m_file[i]->ha_external_lock(thd, lock_type)))
    {
      /* Unlock failures are ignored; lock failures roll back below. */
      if (lock_type != F_UNLCK)
        goto err_handler;
    }
    DBUG_PRINT("info", ("external_lock part %u lock %d", i, lock_type));
    if (lock_type != F_UNLCK)
      bitmap_set_bit(&m_locked_partitions, i);
  }
  if (lock_type == F_UNLCK)
  {
    /* All partitions unlocked: nothing is locked any longer. */
    bitmap_clear_all(used_partitions);
  }
  else
  {
    /* Add touched partitions to be included in reset(). */
    bitmap_union(&m_partitions_to_reset, used_partitions);
  }

  /*
    NOTE(review): m_added_file handlers (presumably added by an ongoing
    partition management operation) are only ever unlocked here, as the
    assert below requires lock_type == F_UNLCK.
  */
  if (m_added_file && m_added_file[0])
  {
    handler **file= m_added_file;
    DBUG_ASSERT(lock_type == F_UNLCK);
    do
    {
      (void) (*file)->ha_external_lock(thd, lock_type);
    } while (*(++file));
  }
  DBUG_RETURN(0);

err_handler:
  /*
    Locking partition i failed: unlock the partitions [first_used_partition,
    i) that were already locked in this call, then report the error.
  */
  uint j;
  for (j= first_used_partition;
       j < i;
       j= bitmap_get_next_set(&m_locked_partitions, j))
  {
    (void) m_file[j]->ha_external_lock(thd, F_UNLCK);
  }
  bitmap_clear_all(&m_locked_partitions);
  DBUG_RETURN(error);
}
3713 
3714 
3715 /*
3716   Get the lock(s) for the table and perform conversion of locks if needed
3717 
3718   SYNOPSIS
3719     store_lock()
3720     thd                   Thread object
3721     to                    Lock object array
3722     lock_type             Table lock type
3723 
3724   RETURN VALUE
3725     >0                   Error code
3726     0                    Success
3727 
3728   DESCRIPTION
3729     The idea with handler::store_lock() is the following:
3730 
3731     The statement decided which locks we should need for the table
3732     for updates/deletes/inserts we get WRITE locks, for SELECT... we get
3733     read locks.
3734 
3735     Before adding the lock into the table lock handler (see thr_lock.c)
3736     mysqld calls store lock with the requested locks.  Store lock can now
3737     modify a write lock to a read lock (or some other lock), ignore the
3738     lock (if we don't want to use MySQL table locks at all) or add locks
3739     for many tables (like we do when we are using a MERGE handler).
3740 
3741     Berkeley DB for partition changes all WRITE locks to TL_WRITE_ALLOW_WRITE
3742     (which signals that we are doing WRITES, but we are still allowing other
3743     readers and writers).
3744 
3745     When releasing locks, store_lock() is also called. In this case one
3746     usually doesn't have to do anything.
3747 
3748     store_lock is called when holding a global mutex to ensure that only
3749     one thread at a time changes the locking information of tables.
3750 
3751     In some exceptional cases MySQL may send a request for a TL_IGNORE;
3752     This means that we are requesting the same lock as last time and this
3753     should also be ignored. (This may happen when someone does a flush
3754     table when we have opened a part of the tables, in which case mysqld
3755     closes and reopens the tables and tries to get the same locks as last
3756     time).  In the future we will probably try to remove this.
3757 
3758     Called from lock.cc by get_lock_data().
3759 */
3760 
store_lock(THD * thd,THR_LOCK_DATA ** to,enum thr_lock_type lock_type)3761 THR_LOCK_DATA **ha_partition::store_lock(THD *thd,
3762 					 THR_LOCK_DATA **to,
3763 					 enum thr_lock_type lock_type)
3764 {
3765   uint i;
3766   DBUG_ENTER("ha_partition::store_lock");
3767   DBUG_ASSERT(thd == current_thd);
3768 
3769   /*
3770     This can be called from get_lock_data() in mysql_lock_abort_for_thread(),
3771     even when thd != table->in_use. In that case don't use partition pruning,
3772     but use all partitions instead to avoid using another threads structures.
3773   */
3774   if (thd != table->in_use)
3775   {
3776     for (i= 0; i < m_tot_parts; i++)
3777       to= m_file[i]->store_lock(thd, to, lock_type);
3778   }
3779   else
3780   {
3781     for (i= bitmap_get_first_set(&(m_part_info->lock_partitions));
3782          i < m_tot_parts;
3783          i= bitmap_get_next_set(&m_part_info->lock_partitions, i))
3784     {
3785       DBUG_PRINT("info", ("store lock %d iteration", i));
3786       to= m_file[i]->store_lock(thd, to, lock_type);
3787     }
3788   }
3789   DBUG_RETURN(to);
3790 }
3791 
3792 /*
3793   Start a statement when table is locked
3794 
3795   SYNOPSIS
3796     start_stmt()
3797     thd                  Thread object
3798     lock_type            Type of external lock
3799 
3800   RETURN VALUE
3801     >0                   Error code
3802     0                    Success
3803 
3804   DESCRIPTION
3805     This method is called instead of external lock when the table is locked
3806     before the statement is executed.
3807 */
3808 
start_stmt(THD * thd,thr_lock_type lock_type)3809 int ha_partition::start_stmt(THD *thd, thr_lock_type lock_type)
3810 {
3811   int error= 0;
3812   uint i;
3813   /* Assert that read_partitions is included in lock_partitions */
3814   DBUG_ASSERT(bitmap_is_subset(&m_part_info->read_partitions,
3815                                &m_part_info->lock_partitions));
3816   /*
3817     m_locked_partitions is set in previous external_lock/LOCK TABLES.
3818     Current statement's lock requests must not include any partitions
3819     not previously locked.
3820   */
3821   DBUG_ASSERT(bitmap_is_subset(&m_part_info->lock_partitions,
3822                                &m_locked_partitions));
3823   DBUG_ENTER("ha_partition::start_stmt");
3824 
3825   for (i= bitmap_get_first_set(&(m_part_info->lock_partitions));
3826        i < m_tot_parts;
3827        i= bitmap_get_next_set(&m_part_info->lock_partitions, i))
3828   {
3829     if ((error= m_file[i]->start_stmt(thd, lock_type)))
3830       break;
3831     /* Add partition to be called in reset(). */
3832     bitmap_set_bit(&m_partitions_to_reset, i);
3833   }
3834   DBUG_RETURN(error);
3835 }
3836 
3837 
3838 /**
3839   Get number of lock objects returned in store_lock
3840 
3841   @returns Number of locks returned in call to store_lock
3842 
3843   @desc
3844     Returns the number of store locks needed in call to store lock.
3845     We return number of partitions we will lock multiplied with number of
3846     locks needed by each partition. Assists the above functions in allocating
3847     sufficient space for lock structures.
3848 */
3849 
lock_count() const3850 uint ha_partition::lock_count() const
3851 {
3852   DBUG_ENTER("ha_partition::lock_count");
3853   /*
3854     The caller want to know the upper bound, to allocate enough memory.
3855     There is no performance lost if we simply return maximum number locks
3856     needed, only some minor over allocation of memory in get_lock_data().
3857 
3858     Also notice that this may be called for another thread != table->in_use,
3859     when mysql_lock_abort_for_thread() is called. So this is more safe, then
3860     using number of partitions after pruning.
3861   */
3862   DBUG_RETURN(m_tot_parts * m_num_locks);
3863 }
3864 
3865 
3866 /*
3867   Unlock last accessed row
3868 
3869   SYNOPSIS
3870     unlock_row()
3871 
3872   RETURN VALUE
3873     NONE
3874 
3875   DESCRIPTION
3876     Record currently processed was not in the result set of the statement
3877     and is thus unlocked. Used for UPDATE and DELETE queries.
3878 */
3879 
void ha_partition::unlock_row()
{
  DBUG_ENTER("ha_partition::unlock_row");
  /* Forward to the partition the last processed row was read from. */
  m_file[m_last_part]->unlock_row();
  DBUG_VOID_RETURN;
}
3886 
3887 /**
3888   Check if semi consistent read was used
3889 
3890   SYNOPSIS
3891     was_semi_consistent_read()
3892 
3893   RETURN VALUE
3894     TRUE   Previous read was a semi consistent read
3895     FALSE  Previous read was not a semi consistent read
3896 
3897   DESCRIPTION
3898     See handler.h:
3899     In an UPDATE or DELETE, if the row under the cursor was locked by another
3900     transaction, and the engine used an optimistic read of the last
3901     committed row value under the cursor, then the engine returns 1 from this
3902     function. MySQL must NOT try to update this optimistic value. If the
3903     optimistic value does not match the WHERE condition, MySQL can decide to
3904     skip over this row. Currently only works for InnoDB. This can be used to
3905     avoid unnecessary lock waits.
3906 
3907     If this method returns nonzero, it will also signal the storage
3908     engine that the next read will be a locking re-read of the row.
3909 */
bool ha_partition::was_semi_consistent_read()
{
  DBUG_ENTER("ha_partition::was_semi_consistent_read");
  /* The last read row must come from a valid, readable partition. */
  DBUG_ASSERT(m_last_part < m_tot_parts &&
              bitmap_is_set(&(m_part_info->read_partitions), m_last_part));
  /* Forward to the partition that produced the last read row. */
  DBUG_RETURN(m_file[m_last_part]->was_semi_consistent_read());
}
3917 
3918 /**
3919   Use semi consistent read if possible
3920 
3921   SYNOPSIS
3922     try_semi_consistent_read()
3923     yes   Turn on semi consistent read
3924 
3925   RETURN VALUE
3926     NONE
3927 
3928   DESCRIPTION
3929     See handler.h:
3930     Tell the engine whether it should avoid unnecessary lock waits.
3931     If yes, in an UPDATE or DELETE, if the row under the cursor was locked
3932     by another transaction, the engine may try an optimistic read of
3933     the last committed row value under the cursor.
3934     Note: prune_partitions are already called before this call, so using
3935     pruning is OK.
3936 */
try_semi_consistent_read(bool yes)3937 void ha_partition::try_semi_consistent_read(bool yes)
3938 {
3939   uint i;
3940   DBUG_ENTER("ha_partition::try_semi_consistent_read");
3941 
3942   i= bitmap_get_first_set(&(m_part_info->read_partitions));
3943   DBUG_ASSERT(i != MY_BIT_NONE);
3944   for (;
3945        i < m_tot_parts;
3946        i= bitmap_get_next_set(&m_part_info->read_partitions, i))
3947   {
3948     m_file[i]->try_semi_consistent_read(yes);
3949   }
3950   DBUG_VOID_RETURN;
3951 }
3952 
3953 
3954 /****************************************************************************
3955                 MODULE change record
3956 ****************************************************************************/
3957 
3958 /*
3959   Insert a row to the table
3960 
3961   SYNOPSIS
3962     write_row()
3963     buf                        The row in MySQL Row Format
3964 
3965   RETURN VALUE
3966     >0                         Error code
3967     0                          Success
3968 
3969   DESCRIPTION
3970     write_row() inserts a row. buf() is a byte array of data, normally
3971     record[0].
3972 
3973     You can use the field information to extract the data from the native byte
3974     array type.
3975 
3976     Example of this would be:
3977     for (Field **field=table->field ; *field ; field++)
3978     {
3979       ...
3980     }
3981 
3982     See ha_tina.cc for a variant of extracting all of the data as strings.
3983     ha_berkeley.cc has a variant of how to store it intact by "packing" it
3984     for ha_berkeley's own native storage type.
3985 
3986     Called from item_sum.cc, item_sum.cc, sql_acl.cc, sql_insert.cc,
3987     sql_insert.cc, sql_select.cc, sql_table.cc, sql_udf.cc, and sql_update.cc.
3988 
3989 */
3990 
int ha_partition::write_row(uchar * buf)
{
  uint32 part_id;
  int error;
  longlong func_value;
  bool have_auto_increment= table->next_number_field && buf == table->record[0];
  my_bitmap_map *old_map;
  THD *thd= ha_thd();
  /* Saved so the exit path can restore them whatever happens below. */
  sql_mode_t saved_sql_mode= thd->variables.sql_mode;
  bool saved_auto_inc_field_not_null= table->auto_increment_field_not_null;
  DBUG_ENTER("ha_partition::write_row");
  DBUG_ASSERT(buf == m_rec0);

  /*
    If we have an auto_increment column and we are writing a changed row
    or a new row, then update the auto_increment value in the record.
  */
  if (have_auto_increment)
  {
    if (!part_share->auto_inc_initialized &&
        !table_share->next_number_keypart)
    {
      /*
        If auto_increment in table_share is not initialized, start by
        initializing it.
      */
      info(HA_STATUS_AUTO);
    }
    error= update_auto_increment();

    /*
      If we have failed to set the auto-increment value for this row,
      it is highly likely that we will not be able to insert it into
      the correct partition. We must check and fail if necessary.
    */
    if (error)
      goto exit;

    /*
      Don't allow generation of the auto_increment value by the partition's
      handler. If a partition's handler would change the value, then it
      might not match the partition any longer.
      This can occur with 'SET INSERT_ID = 0; INSERT (NULL)',
      so allow this by adding 'MODE_NO_AUTO_VALUE_ON_ZERO' to sql_mode.
      The partition handler::next_insert_id must always be 0. Otherwise
      we need to forward release_auto_increment, or reset it for all
      partitions.
    */
    if (table->next_number_field->val_int() == 0)
    {
      table->auto_increment_field_not_null= TRUE;
      thd->variables.sql_mode|= MODE_NO_AUTO_VALUE_ON_ZERO;
    }
  }

  /* Temporarily mark all columns readable to compute the partition id. */
  old_map= dbug_tmp_use_all_columns(table, table->read_set);
  error= m_part_info->get_partition_id(m_part_info, &part_id, &func_value);
  dbug_tmp_restore_column_map(table->read_set, old_map);
  if (unlikely(error))
  {
    m_part_info->err_value= func_value;
    goto exit;
  }
  /* The target partition must be among those locked by the statement. */
  if (!bitmap_is_set(&(m_part_info->lock_partitions), part_id))
  {
    DBUG_PRINT("info", ("Write to non-locked partition %u (func_value: %ld)",
                        part_id, (long) func_value));
    error= HA_ERR_NOT_IN_LOCK_PARTITIONS;
    goto exit;
  }
  m_last_part= part_id;
  DBUG_PRINT("info", ("Insert in partition %d", part_id));
  /* Lazily start bulk insert on this partition if applicable. */
  start_part_bulk_insert(thd, part_id);

  tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
  error= m_file[part_id]->ha_write_row(buf);
  if (have_auto_increment && !table->s->next_number_keypart)
    set_auto_increment_if_higher(table->next_number_field);
  reenable_binlog(thd);
exit:
  /* Restore thread/table state modified above, on all paths. */
  thd->variables.sql_mode= saved_sql_mode;
  table->auto_increment_field_not_null= saved_auto_inc_field_not_null;
  DBUG_RETURN(error);
}
4075 
4076 
4077 /*
4078   Update an existing row
4079 
4080   SYNOPSIS
4081     update_row()
4082     old_data                 Old record in MySQL Row Format
4083     new_data                 New record in MySQL Row Format
4084 
4085   RETURN VALUE
4086     >0                         Error code
4087     0                          Success
4088 
4089   DESCRIPTION
4090     Yes, update_row() does what you expect, it updates a row. old_data will
4091     have the previous row record in it, while new_data will have the newest
4092     data in it.
4093     Keep in mind that the server can do updates based on ordering if an
4094     ORDER BY clause was used. Consecutive ordering is not guaranteed.
4095 
4096     Called from sql_select.cc, sql_acl.cc, sql_update.cc, and sql_insert.cc.
4097     new_data is always record[0]
4098     old_data is always record[1]
4099 */
4100 
int ha_partition::update_row(const uchar *old_data, uchar *new_data)
{
  THD *thd= ha_thd();
  uint32 new_part_id, old_part_id;
  int error= 0;
  longlong func_value;
  DBUG_ENTER("ha_partition::update_row");
  m_err_rec= NULL;

  // Need to read partition-related columns, to locate the row's partition:
  DBUG_ASSERT(bitmap_is_subset(&m_part_info->full_part_field_set,
                               table->read_set));
  /* Compute which partition the row came from and where it should go. */
  if ((error= get_parts_for_update(old_data, new_data, table->record[0],
                                   m_part_info, &old_part_id, &new_part_id,
                                   &func_value)))
  {
    m_part_info->err_value= func_value;
    goto exit;
  }
  DBUG_ASSERT(bitmap_is_set(&(m_part_info->read_partitions), old_part_id));
  /* The destination partition must be locked by this statement. */
  if (!bitmap_is_set(&(m_part_info->lock_partitions), new_part_id))
  {
    error= HA_ERR_NOT_IN_LOCK_PARTITIONS;
    goto exit;
  }

  /*
    The protocol for updating a row is:
    1) position the handler (cursor) on the row to be updated,
       either through the last read row (rnd or index) or by rnd_pos.
    2) call update_row with both old and new full records as arguments.

    This means that m_last_part should already be set to actual partition
    where the row was read from. And if that is not the same as the
    calculated part_id we found a misplaced row, we return an error to
    notify the user that something is broken in the row distribution
    between partitions! Since we don't check all rows on read, we return an
    error instead of correcting m_last_part, to make the user aware of the
    problem!

    Notice that HA_READ_BEFORE_WRITE_REMOVAL does not require this protocol,
    so this is not supported for this engine.
  */
  if (old_part_id != m_last_part)
  {
    m_err_rec= old_data;
    DBUG_RETURN(HA_ERR_ROW_IN_WRONG_PARTITION);
  }

  m_last_part= new_part_id;
  start_part_bulk_insert(thd, new_part_id);
  if (new_part_id == old_part_id)
  {
    /* Row stays in the same partition: a plain in-place update. */
    DBUG_PRINT("info", ("Update in partition %d", new_part_id));
    tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
    error= m_file[new_part_id]->ha_update_row(old_data, new_data);
    reenable_binlog(thd);
    goto exit;
  }
  else
  {
    /* Row moves partitions: write to the new one, delete from the old. */
    Field *saved_next_number_field= table->next_number_field;
    /*
      Don't allow generation of auto_increment value for update.
      table->next_number_field is never set on UPDATE.
      But is set for INSERT ... ON DUPLICATE KEY UPDATE,
      and since update_row() does not generate or update an auto_inc value,
      we cannot have next_number_field set when moving a row
      to another partition with write_row(), since that could
      generate/update the auto_inc value.
      This gives the same behavior for partitioned vs non partitioned tables.
    */
    table->next_number_field= NULL;
    DBUG_PRINT("info", ("Update from partition %d to partition %d",
			old_part_id, new_part_id));
    tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
    error= m_file[new_part_id]->ha_write_row(new_data);
    reenable_binlog(thd);
    table->next_number_field= saved_next_number_field;
    if (error)
      goto exit;

    tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
    error= m_file[old_part_id]->ha_delete_row(old_data);
    reenable_binlog(thd);
    if (error)
    {
      /* The row now exists in both partitions; no compensation is done. */
#ifdef IN_THE_FUTURE
      (void) m_file[new_part_id]->delete_last_inserted_row(new_data);
#endif
      goto exit;
    }
  }

exit:
  /*
    if updating an auto_increment column, update
    part_share->next_auto_inc_val if needed.
    (not to be used if auto_increment on secondary field in a multi-column
    index)
    mysql_update does not set table->next_number_field, so we use
    table->found_next_number_field instead.
    Also checking that the field is marked in the write set.
  */
  if (table->found_next_number_field &&
      new_data == table->record[0] &&
      !table->s->next_number_keypart &&
      bitmap_is_set(table->write_set,
                    table->found_next_number_field->field_index))
  {
    if (!part_share->auto_inc_initialized)
      info(HA_STATUS_AUTO);
    set_auto_increment_if_higher(table->found_next_number_field);
  }
  DBUG_RETURN(error);
}
4217 
4218 
4219 /*
4220   Remove an existing row
4221 
4222   SYNOPSIS
4223     delete_row
4224     buf                      Deleted row in MySQL Row Format
4225 
4226   RETURN VALUE
4227     >0                       Error Code
4228     0                        Success
4229 
4230   DESCRIPTION
4231     This will delete a row. buf will contain a copy of the row to be deleted.
4232     The server will call this right after the current row has been read
4233     (from either a previous rnd_xxx() or index_xxx() call).
4234     If you keep a pointer to the last row or can access a primary key it will
4235     make doing the deletion quite a bit easier.
4236     Keep in mind that the server does not guarantee consecutive deletions.
4237     ORDER BY clauses can be used.
4238 
4239     Called in sql_acl.cc and sql_udf.cc to manage internal table information.
4240     Called in sql_delete.cc, sql_insert.cc, and sql_select.cc. In sql_select
4241     it is used for removing duplicates while in insert it is used for REPLACE
4242     calls.
4243 
4244     buf is either record[0] or record[1]
4245 */
4246 
int ha_partition::delete_row(const uchar *buf)
{
  uint32 part_id;
  int error;
  THD *thd= ha_thd();
  DBUG_ENTER("ha_partition::delete_row");
  m_err_rec= NULL;

  /* Partition-related columns must be readable to locate the partition. */
  DBUG_ASSERT(bitmap_is_subset(&m_part_info->full_part_field_set,
                               table->read_set));
  if ((error= get_part_for_delete(buf, m_rec0, m_part_info, &part_id)))
  {
    DBUG_RETURN(error);
  }
  /* Should never call delete_row on a partition which is not read */
  DBUG_ASSERT(bitmap_is_set(&(m_part_info->read_partitions), part_id));
  DBUG_ASSERT(bitmap_is_set(&(m_part_info->lock_partitions), part_id));
  /* Debug builds assert above; release builds return an error instead. */
  if (!bitmap_is_set(&(m_part_info->lock_partitions), part_id))
    DBUG_RETURN(HA_ERR_NOT_IN_LOCK_PARTITIONS);

  /*
    The protocol for deleting a row is:
    1) position the handler (cursor) on the row to be deleted,
       either through the last read row (rnd or index) or by rnd_pos.
    2) call delete_row with the full record as argument.

    This means that m_last_part should already be set to actual partition
    where the row was read from. And if that is not the same as the
    calculated part_id we found a misplaced row, we return an error to
    notify the user that something is broken in the row distribution
    between partitions! Since we don't check all rows on read, we return an
    error instead of forwarding the delete to the correct (m_last_part)
    partition!

    Notice that HA_READ_BEFORE_WRITE_REMOVAL does not require this protocol,
    so this is not supported for this engine.

    TODO: change the assert in InnoDB into an error instead and make this one
    an assert instead and remove the get_part_for_delete()!
  */
  if (part_id != m_last_part)
  {
    m_err_rec= buf;
    DBUG_RETURN(HA_ERR_ROW_IN_WRONG_PARTITION);
  }

  m_last_part= part_id;
  tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
  error= m_file[part_id]->ha_delete_row(buf);
  reenable_binlog(thd);
  DBUG_RETURN(error);
}
4299 
4300 
4301 /*
4302   Delete all rows in a table
4303 
4304   SYNOPSIS
4305     delete_all_rows()
4306 
4307   RETURN VALUE
4308     >0                       Error Code
4309     0                        Success
4310 
4311   DESCRIPTION
4312     Used to delete all rows in a table. Both for cases of truncate and
4313     for cases where the optimizer realizes that all rows will be
4314     removed as a result of a SQL statement.
4315 
4316     Called from item_sum.cc by Item_func_group_concat::clear(),
4317     Item_sum_count_distinct::clear(), and Item_func_group_concat::clear().
4318     Called from sql_delete.cc by mysql_delete().
4319     Called from sql_select.cc by JOIN::reset().
4320     Called from sql_union.cc by st_select_lex_unit::exec().
4321 */
4322 
delete_all_rows()4323 int ha_partition::delete_all_rows()
4324 {
4325   int error;
4326   uint i;
4327   DBUG_ENTER("ha_partition::delete_all_rows");
4328 
4329   for (i= bitmap_get_first_set(&m_part_info->read_partitions);
4330        i < m_tot_parts;
4331        i= bitmap_get_next_set(&m_part_info->read_partitions, i))
4332   {
4333     /* Can be pruned, like DELETE FROM t PARTITION (pX) */
4334     if ((error= m_file[i]->ha_delete_all_rows()))
4335       DBUG_RETURN(error);
4336   }
4337   DBUG_RETURN(0);
4338 }
4339 
4340 
4341 /**
4342   Manually truncate the table.
4343 
4344   @retval  0    Success.
4345   @retval  > 0  Error code.
4346 */
4347 
truncate()4348 int ha_partition::truncate()
4349 {
4350   int error;
4351   handler **file;
4352   DBUG_ENTER("ha_partition::truncate");
4353 
4354   /*
4355     TRUNCATE also means resetting auto_increment. Hence, reset
4356     it so that it will be initialized again at the next use.
4357   */
4358   lock_auto_increment();
4359   part_share->next_auto_inc_val= 0;
4360   part_share->auto_inc_initialized= false;
4361   unlock_auto_increment();
4362 
4363   file= m_file;
4364   do
4365   {
4366     if ((error= (*file)->ha_truncate()))
4367       DBUG_RETURN(error);
4368   } while (*(++file));
4369   DBUG_RETURN(0);
4370 }
4371 
4372 
4373 /**
4374   Truncate a set of specific partitions.
4375 
4376   @remark Auto increment value will be truncated in that partition as well!
4377 
4378   ALTER TABLE t TRUNCATE PARTITION ...
4379 */
4380 
int ha_partition::truncate_partition(Alter_info *alter_info, bool *binlog_stmt)
{
  int error= 0;
  List_iterator<partition_element> part_it(m_part_info->partitions);
  uint num_parts= m_part_info->num_parts;
  uint num_subparts= m_part_info->num_subparts;
  uint i= 0;
  DBUG_ENTER("ha_partition::truncate_partition");

  /* Only binlog when it starts any call to the partitions handlers */
  *binlog_stmt= false;

  /* Mark the named partitions PART_ADMIN; fail if none matches. */
  if (set_part_state(alter_info, m_part_info, PART_ADMIN))
    DBUG_RETURN(HA_ERR_NO_PARTITION_FOUND);

  /*
    TRUNCATE also means resetting auto_increment. Hence, reset
    it so that it will be initialized again at the next use.
  */
  lock_auto_increment();
  part_share->next_auto_inc_val= 0;
  part_share->auto_inc_initialized= FALSE;
  unlock_auto_increment();

  *binlog_stmt= true;

  do
  {
    partition_element *part_elem= part_it++;
    /* Only partitions selected above (marked PART_ADMIN) are truncated. */
    if (part_elem->part_state == PART_ADMIN)
    {
      if (m_is_sub_partitioned)
      {
        /* Truncate every subpartition of the selected partition. */
        List_iterator<partition_element>
                                    subpart_it(part_elem->subpartitions);
        partition_element *sub_elem;
        uint j= 0, part;
        do
        {
          sub_elem= subpart_it++;
          /* m_file is indexed by (partition * num_subparts + subpartition). */
          part= i * num_subparts + j;
          DBUG_PRINT("info", ("truncate subpartition %u (%s)",
                              part, sub_elem->partition_name));
          if ((error= m_file[part]->ha_truncate()))
            break;
          sub_elem->part_state= PART_NORMAL;
        } while (++j < num_subparts);
      }
      else
      {
        DBUG_PRINT("info", ("truncate partition %u (%s)", i,
                            part_elem->partition_name));
        error= m_file[i]->ha_truncate();
      }
      part_elem->part_state= PART_NORMAL;
    }
  } while (!error && (++i < num_parts));
  if (error)
  {
    /* Reset to PART_NORMAL. */
    set_all_part_state(m_part_info, PART_NORMAL);
  }
  DBUG_RETURN(error);
}
4445 
4446 
4447 /*
4448   Start a large batch of insert rows
4449 
4450   SYNOPSIS
4451     start_bulk_insert()
4452     rows                  Number of rows to insert
4453 
4454   RETURN VALUE
4455     NONE
4456 
4457   DESCRIPTION
4458     rows == 0 means we will probably insert many rows
4459 */
void ha_partition::start_bulk_insert(ha_rows rows)
{
  DBUG_ENTER("ha_partition::start_bulk_insert");

  /*
    The rows estimate is not used here; bulk insert is started lazily per
    partition in start_part_bulk_insert().
  */
  m_bulk_inserted_rows= 0;
  bitmap_clear_all(&m_bulk_insert_started);
  /* use the last bit for marking if bulk_insert_started was called */
  bitmap_set_bit(&m_bulk_insert_started, m_tot_parts);
  DBUG_VOID_RETURN;
}
4470 
4471 
4472 /*
4473   Check if start_bulk_insert has been called for this partition,
4474   if not, call it and mark it called
4475 */
start_part_bulk_insert(THD * thd,uint part_id)4476 void ha_partition::start_part_bulk_insert(THD *thd, uint part_id)
4477 {
4478   long old_buffer_size;
4479   if (!bitmap_is_set(&m_bulk_insert_started, part_id) &&
4480       bitmap_is_set(&m_bulk_insert_started, m_tot_parts))
4481   {
4482     DBUG_ASSERT(bitmap_is_set(&(m_part_info->lock_partitions), part_id));
4483     old_buffer_size= thd->variables.read_buff_size;
4484     /* Update read_buffer_size for this partition */
4485     thd->variables.read_buff_size= estimate_read_buffer_size(old_buffer_size);
4486     m_file[part_id]->ha_start_bulk_insert(guess_bulk_insert_rows());
4487     bitmap_set_bit(&m_bulk_insert_started, part_id);
4488     thd->variables.read_buff_size= old_buffer_size;
4489   }
4490   m_bulk_inserted_rows++;
4491 }
4492 
4493 /*
4494   Estimate the read buffer size for each partition.
4495   SYNOPSIS
4496     ha_partition::estimate_read_buffer_size()
4497     original_size  read buffer size originally set for the server
4498   RETURN VALUE
4499     estimated buffer size.
4500   DESCRIPTION
4501     If the estimated number of rows to insert is less than 10 (but not 0)
4502     the new buffer size is same as original buffer size.
    In case of the first partition, or when the partition function is
    monotonic, the new buffer size is the same as the original buffer size.
    For the rest of the partitions, a total buffer of 10*original_size is
    divided equally if the number of partitions is more than 10; otherwise
    each partition is allowed to use the original buffer size.
4508 */
estimate_read_buffer_size(long original_size)4509 long ha_partition::estimate_read_buffer_size(long original_size)
4510 {
4511   /*
4512     If number of rows to insert is less than 10, but not 0,
4513     return original buffer size.
4514   */
4515   if (estimation_rows_to_insert && (estimation_rows_to_insert < 10))
4516     return (original_size);
4517   /*
4518     If first insert/partition and monotonic partition function,
4519     allow using buffer size originally set.
4520    */
4521   if (!m_bulk_inserted_rows &&
4522       m_part_func_monotonicity_info != NON_MONOTONIC &&
4523       m_tot_parts > 1)
4524     return original_size;
4525   /*
4526     Allow total buffer used in all partition to go up to 10*read_buffer_size.
4527     11*read_buffer_size in case of monotonic partition function.
4528   */
4529 
4530   if (m_tot_parts < 10)
4531       return original_size;
4532   return (original_size * 10 / m_tot_parts);
4533 }
4534 
4535 /*
4536   Try to predict the number of inserts into this partition.
4537 
4538   If less than 10 rows (including 0 which means Unknown)
4539     just give that as a guess
4540   If monotonic partitioning function was used
4541     guess that 50 % of the inserts goes to the first partition
4542   For all other cases, guess on equal distribution between the partitions
4543 */
guess_bulk_insert_rows()4544 ha_rows ha_partition::guess_bulk_insert_rows()
4545 {
4546   DBUG_ENTER("guess_bulk_insert_rows");
4547 
4548   if (estimation_rows_to_insert < 10)
4549     DBUG_RETURN(estimation_rows_to_insert);
4550 
4551   /* If first insert/partition and monotonic partition function, guess 50%.  */
4552   if (!m_bulk_inserted_rows &&
4553       m_part_func_monotonicity_info != NON_MONOTONIC &&
4554       m_tot_parts > 1)
4555     DBUG_RETURN(estimation_rows_to_insert / 2);
4556 
4557   /* Else guess on equal distribution (+1 is to avoid returning 0/Unknown) */
4558   if (m_bulk_inserted_rows < estimation_rows_to_insert)
4559     DBUG_RETURN(((estimation_rows_to_insert - m_bulk_inserted_rows)
4560                 / m_tot_parts) + 1);
4561   /* The estimation was wrong, must say 'Unknown' */
4562   DBUG_RETURN(0);
4563 }
4564 
4565 
4566 /**
4567   Finish a large batch of insert rows.
4568 
4569   @return Operation status.
4570     @retval     0 Success
4571     @retval  != 0 Error code
4572 */
4573 
end_bulk_insert()4574 int ha_partition::end_bulk_insert()
4575 {
4576   int error= 0;
4577   uint i;
4578   DBUG_ENTER("ha_partition::end_bulk_insert");
4579 
4580   if (!bitmap_is_set(&m_bulk_insert_started, m_tot_parts))
4581   {
4582     DBUG_ASSERT(0);
4583     DBUG_RETURN(error);
4584   }
4585 
4586   for (i= bitmap_get_first_set(&m_bulk_insert_started);
4587        i < m_tot_parts;
4588        i= bitmap_get_next_set(&m_bulk_insert_started, i))
4589   {
4590     int tmp;
4591     if ((tmp= m_file[i]->ha_end_bulk_insert()))
4592       error= tmp;
4593   }
4594   bitmap_clear_all(&m_bulk_insert_started);
4595   DBUG_RETURN(error);
4596 }
4597 
4598 
4599 /****************************************************************************
4600                 MODULE full table scan
4601 ****************************************************************************/
4602 /*
4603   Initialize engine for random reads
4604 
4605   SYNOPSIS
4606     ha_partition::rnd_init()
4607     scan	0  Initialize for random reads through rnd_pos()
4608 		1  Initialize for random scan through rnd_next()
4609 
4610   RETURN VALUE
4611     >0          Error code
4612     0           Success
4613 
4614   DESCRIPTION
4615     rnd_init() is called when the server wants the storage engine to do a
4616     table scan or when the server wants to access data through rnd_pos.
4617 
4618     When scan is used we will scan one handler partition at a time.
4619     When preparing for rnd_pos we will init all handler partitions.
    No extra cache handling is needed when scanning is not performed.
4621 
4622     Before initialising we will call rnd_end to ensure that we clean up from
4623     any previous incarnation of a table scan.
4624     Called from filesort.cc, records.cc, sql_handler.cc, sql_select.cc,
4625     sql_table.cc, and sql_update.cc.
4626 */
4627 
int ha_partition::rnd_init(bool scan)
{
  int error;
  /* i tracks how far the init loop below got; used for cleanup at err. */
  uint i= 0;
  uint32 part_id;
  DBUG_ENTER("ha_partition::rnd_init");

  /*
    For operations that may need to change data, we may need to extend
    read_set.
  */
  if (get_lock_type() == F_WRLCK)
  {
    /*
      If write_set contains any of the fields used in partition and
      subpartition expression, we need to set all bits in read_set because
      the row may need to be inserted in a different [sub]partition. In
      other words update_row() can be converted into write_row(), which
      requires a complete record.
    */
    if (bitmap_is_overlapping(&m_part_info->full_part_field_set,
                              table->write_set))
      bitmap_set_all(table->read_set);
    else
    {
      /*
        Some handlers only read fields as specified by the bitmap for the
        read set. For partitioned handlers we always require that the
        fields of the partition functions are read such that we can
        calculate the partition id to place updated and deleted records.
      */
      bitmap_union(table->read_set, &m_part_info->full_part_field_set);
    }
  }

  /* Now we see what the index of our first important partition is */
  DBUG_PRINT("info", ("m_part_info->read_partitions: 0x%lx",
                      (long) m_part_info->read_partitions.bitmap));
  part_id= bitmap_get_first_set(&(m_part_info->read_partitions));
  DBUG_PRINT("info", ("m_part_spec.start_part %d", part_id));

  if (MY_BIT_NONE == part_id)
  {
    /* No partitions to read: not an error, but mark scan as inactive. */
    error= 0;
    goto err1;
  }

  /*
    We have a partition and we are scanning with rnd_next
    so we bump our cache
  */
  DBUG_PRINT("info", ("rnd_init on partition %d", part_id));
  if (scan)
  {
    /*
      rnd_end() is needed for partitioning to reset internal data if scan
      is already in use
    */
    rnd_end();
    /* Table scan: only the first used partition is initialized now;
       rnd_next() initializes the following partitions as it reaches them. */
    late_extra_cache(part_id);
    if ((error= m_file[part_id]->ha_rnd_init(scan)))
      goto err;
  }
  else
  {
    /* rnd_pos mode: every used partition must be initialized up front. */
    for (i= part_id;
         i < m_tot_parts;
         i= bitmap_get_next_set(&m_part_info->read_partitions, i))
    {
      if ((error= m_file[i]->ha_rnd_init(scan)))
        goto err;
    }
  }
  /* m_scan_value: 1 = table scan, 0 = rnd_pos mode, 2 = error/inactive. */
  m_scan_value= scan;
  m_part_spec.start_part= part_id;
  m_part_spec.end_part= m_tot_parts - 1;
  DBUG_PRINT("info", ("m_scan_value=%d", m_scan_value));
  DBUG_RETURN(0);

err:
  /* Call rnd_end for all previously inited partitions. */
  /* In the scan branch i is still 0, so this loop is a no-op there;
     in the rnd_pos branch it ends partitions [part_id, i) that
     succeeded before the failure at i. */
  for (;
       part_id < i;
       part_id= bitmap_get_next_set(&m_part_info->read_partitions, part_id))
  {
    m_file[part_id]->ha_rnd_end();
  }
err1:
  m_scan_value= 2;
  m_part_spec.start_part= NO_CURRENT_PART_ID;
  DBUG_RETURN(error);
}
4720 
4721 
4722 /*
4723   End of a table scan
4724 
4725   SYNOPSIS
4726     rnd_end()
4727 
4728   RETURN VALUE
4729     >0          Error code
4730     0           Success
4731 */
4732 
rnd_end()4733 int ha_partition::rnd_end()
4734 {
4735   DBUG_ENTER("ha_partition::rnd_end");
4736   switch (m_scan_value) {
4737   case 2:                                       // Error
4738     break;
4739   case 1:
4740     if (NO_CURRENT_PART_ID != m_part_spec.start_part)         // Table scan
4741     {
4742       late_extra_no_cache(m_part_spec.start_part);
4743       m_file[m_part_spec.start_part]->ha_rnd_end();
4744     }
4745     break;
4746   case 0:
4747     uint i;
4748     for (i= bitmap_get_first_set(&m_part_info->read_partitions);
4749          i < m_tot_parts;
4750          i= bitmap_get_next_set(&m_part_info->read_partitions, i))
4751     {
4752       m_file[i]->ha_rnd_end();
4753     }
4754     break;
4755   }
4756   m_scan_value= 2;
4757   m_part_spec.start_part= NO_CURRENT_PART_ID;
4758   DBUG_RETURN(0);
4759 }
4760 
4761 /*
4762   read next row during full table scan (scan in random row order)
4763 
4764   SYNOPSIS
4765     rnd_next()
4766     buf		buffer that should be filled with data
4767 
4768   RETURN VALUE
4769     >0          Error code
4770     0           Success
4771 
4772   DESCRIPTION
4773     This is called for each row of the table scan. When you run out of records
4774     you should return HA_ERR_END_OF_FILE.
4775     The Field structure for the table is the key to getting data into buf
4776     in a manner that will allow the server to understand it.
4777 
4778     Called from filesort.cc, records.cc, sql_handler.cc, sql_select.cc,
4779     sql_table.cc, and sql_update.cc.
4780 */
4781 
int ha_partition::rnd_next(uchar *buf)
{
  handler *file;
  int result= HA_ERR_END_OF_FILE;
  uint part_id= m_part_spec.start_part;
  DBUG_ENTER("ha_partition::rnd_next");

  if (NO_CURRENT_PART_ID == part_id)
  {
    /*
      The original set of partitions to scan was empty and thus we report
      the result here.
    */
    goto end;
  }

  DBUG_ASSERT(m_scan_value == 1);
  file= m_file[part_id];

  /*
    Scan the current partition until it is exhausted, then move on to the
    next partition in the read set, initializing it lazily. Loop exits by
    returning a row, running out of partitions, or hitting an error.
  */
  while (TRUE)
  {
    result= file->ha_rnd_next(buf);
    if (!result)
    {
      /* Found a row: remember which partition it came from. */
      m_last_part= part_id;
      m_part_spec.start_part= part_id;
      table->status= 0;
      DBUG_RETURN(0);
    }

    /*
      if we get here, then the current partition ha_rnd_next returned failure
    */
    if (result == HA_ERR_RECORD_DELETED)
      continue;                               // Probably MyISAM

    if (result != HA_ERR_END_OF_FILE)
      goto end_dont_reset_start_part;         // Return error

    /* End current partition */
    late_extra_no_cache(part_id);
    DBUG_PRINT("info", ("rnd_end on partition %d", part_id));
    if ((result= file->ha_rnd_end()))
      break;

    /* Shift to next partition */
    part_id= bitmap_get_next_set(&m_part_info->read_partitions, part_id);
    if (part_id >= m_tot_parts)
    {
      /* No more partitions in the read set: the whole scan is done. */
      result= HA_ERR_END_OF_FILE;
      break;
    }
    m_last_part= part_id;
    m_part_spec.start_part= part_id;
    file= m_file[part_id];
    DBUG_PRINT("info", ("rnd_init on partition %d", part_id));
    if ((result= file->ha_rnd_init(1)))
      break;
    late_extra_cache(part_id);
  }

end:
  /* Scan finished (or never started): forget the current partition. */
  m_part_spec.start_part= NO_CURRENT_PART_ID;
end_dont_reset_start_part:
  /* On a mid-partition error, start_part is kept so rnd_end() can clean up. */
  table->status= STATUS_NOT_FOUND;
  DBUG_RETURN(result);
}
4849 
4850 
4851 /*
4852   Save position of current row
4853 
4854   SYNOPSIS
4855     position()
4856     record             Current record in MySQL Row Format
4857 
4858   RETURN VALUE
4859     NONE
4860 
4861   DESCRIPTION
4862     position() is called after each call to rnd_next() if the data needs
4863     to be ordered. You can do something like the following to store
4864     the position:
4865     ha_store_ptr(ref, ref_length, current_position);
4866 
4867     The server uses ref to store data. ref_length in the above case is
4868     the size needed to store current_position. ref is just a byte array
4869     that the server will maintain. If you are using offsets to mark rows, then
4870     current_position should be the offset. If it is a primary key like in
4871     BDB, then it needs to be a primary key.
4872 
4873     Called from filesort.cc, sql_select.cc, sql_delete.cc and sql_update.cc.
4874 */
4875 
void ha_partition::position(const uchar *record)
{
  handler *file= m_file[m_last_part];
  uint pad_length;
  DBUG_ASSERT(bitmap_is_set(&(m_part_info->read_partitions), m_last_part));
  DBUG_ENTER("ha_partition::position");

  /* ref layout: [2 bytes partition id][underlying handler's ref][padding] */
  int2store(ref, m_last_part);
  /*
    If m_sec_sort_by_rowid is set, then the ref is already stored in the
    priority queue (m_queue) when doing ordered scans.
  */
  if (m_sec_sort_by_rowid && m_ordered_scan_ongoing)
  {
    DBUG_ASSERT(m_queue.elements);
    DBUG_ASSERT(m_ordered_rec_buffer);
    DBUG_ASSERT(!m_curr_key_info[1]);
    /* We already have the ref. */
    memcpy(ref + PARTITION_BYTES_IN_POS,
           queue_top(&m_queue) + PARTITION_BYTES_IN_POS,
	   file->ref_length);
#ifndef DBUG_OFF
    /* Verify that the position is correct! */
    file->position(record);
    DBUG_ASSERT(!memcmp(ref + PARTITION_BYTES_IN_POS, file->ref,
                        file->ref_length));
#endif
  }
  else
  {
    /* Ask the underlying handler for its position and append it. */
    file->position(record);
    memcpy((ref + PARTITION_BYTES_IN_POS), file->ref, file->ref_length);
  }
  /*
    m_ref_length is sized for the partition with the longest ref; zero out
    the tail so refs compare equal regardless of which partition they
    came from.
  */
  pad_length= m_ref_length - PARTITION_BYTES_IN_POS - file->ref_length;
  if (pad_length)
    memset((ref + PARTITION_BYTES_IN_POS + file->ref_length), 0, pad_length);

  DBUG_VOID_RETURN;
}
4915 
4916 
4917 /*
4918   Read row using position
4919 
4920   SYNOPSIS
4921     rnd_pos()
4922     out:buf                     Row read in MySQL Row Format
4923     position                    Position of read row
4924 
4925   RETURN VALUE
4926     >0                          Error code
4927     0                           Success
4928 
4929   DESCRIPTION
4930     This is like rnd_next, but you are given a position to use
4931     to determine the row. The position will be of the type that you stored in
4932     ref. You can use ha_get_ptr(pos,ref_length) to retrieve whatever key
4933     or position you saved when position() was called.
4934     Called from filesort.cc records.cc sql_insert.cc sql_select.cc
4935     sql_update.cc.
4936 */
4937 
rnd_pos(uchar * buf,uchar * pos)4938 int ha_partition::rnd_pos(uchar * buf, uchar *pos)
4939 {
4940   uint part_id;
4941   handler *file;
4942   DBUG_ENTER("ha_partition::rnd_pos");
4943 
4944   part_id= uint2korr((const uchar *) pos);
4945   DBUG_ASSERT(part_id < m_tot_parts);
4946   file= m_file[part_id];
4947   DBUG_ASSERT(bitmap_is_set(&(m_part_info->read_partitions), part_id));
4948   m_last_part= part_id;
4949   DBUG_RETURN(file->ha_rnd_pos(buf, (pos + PARTITION_BYTES_IN_POS)));
4950 }
4951 
4952 
4953 /*
4954   Read row using position using given record to find
4955 
4956   SYNOPSIS
4957     rnd_pos_by_record()
4958     record             Current record in MySQL Row Format
4959 
4960   RETURN VALUE
4961     >0                 Error code
4962     0                  Success
4963 
4964   DESCRIPTION
4965     this works as position()+rnd_pos() functions, but does some extra work,
4966     calculating m_last_part - the partition to where the 'record'
4967     should go.
4968 
4969     called from replication (log_event.cc)
4970 */
4971 
rnd_pos_by_record(uchar * record)4972 int ha_partition::rnd_pos_by_record(uchar *record)
4973 {
4974   DBUG_ENTER("ha_partition::rnd_pos_by_record");
4975   if (unlikely(get_part_for_delete(record, m_rec0, m_part_info, &m_last_part)))
4976     DBUG_RETURN(1);
4977   DBUG_RETURN(m_file[m_last_part]->rnd_pos_by_record(record));
4978 }
4979 
4980 /****************************************************************************
4981                 MODULE index scan
4982 ****************************************************************************/
4983 /*
4984   Positions an index cursor to the index specified in the handle. Fetches the
4985   row if available. If the key value is null, begin at the first key of the
4986   index.
4987 
4988   There are loads of optimisations possible here for the partition handler.
4989   The same optimisations can also be checked for full table scan although
4990   only through conditions and not from index ranges.
4991   Phase one optimisations:
4992     Check if the fields of the partition function are bound. If so only use
4993     the single partition it becomes bound to.
4994   Phase two optimisations:
4995     If it can be deducted through range or list partitioning that only a
4996     subset of the partitions are used, then only use those partitions.
4997 */
4998 
4999 /** Compare key and rowid.
5000   Helper function for sorting records in the priority queue.
5001   a/b points to table->record[0] rows which must have the
5002   key fields set. The bytes before a and b store the handler::ref.
5003   This is used for comparing/sorting rows first according to
5004   KEY and if same KEY, by handler::ref (rowid).
5005 
5006   @param key_info  Null terminated array of index information
5007   @param a         Pointer to record+ref in first record
5008   @param b         Pointer to record+ref in second record
5009 
5010   @return Return value is SIGN(first_rec - second_rec)
5011     @retval  0                  Keys are equal
5012     @retval -1                  second_rec is greater than first_rec
5013     @retval +1                  first_rec is greater than second_rec
5014 */
5015 
key_and_ref_cmp(void * key_info,uchar * a,uchar * b)5016 static int key_and_ref_cmp(void* key_info, uchar *a, uchar *b)
5017 {
5018   int cmp= key_rec_cmp(key_info, a, b);
5019   if (cmp)
5020     return cmp;
5021   /*
5022     We must compare by handler::ref, which is added before the record,
5023     in the priority queue.
5024   */
5025   KEY **key = (KEY**)key_info;
5026   uint ref_length= (*key)->table->file->ref_length;
5027   return (*key)->table->file->cmp_ref(a - ref_length, b - ref_length);
5028 }
5029 
5030 
5031 /**
5032   Setup the ordered record buffer and the priority queue.
5033 */
5034 
/* Allocate the ordered record buffer and initialize the priority queue
   used by ordered index scans.
   Returns true on allocation/initialization failure, false on success. */
bool ha_partition::init_record_priority_queue()
{
  DBUG_ENTER("ha_partition::init_record_priority_queue");
  DBUG_ASSERT(!m_ordered_rec_buffer);
  /*
    Initialize the ordered record buffer.
  */
  /* Defensive check in release builds; the assert above catches misuse
     in debug builds. */
  if (!m_ordered_rec_buffer)
  {
    uint alloc_len;
    uint used_parts= bitmap_bits_set(&m_part_info->read_partitions);
    /*
      Allocate record buffer for each used partition.
      If we need to do a secondary sort by PK, then it is already in the
      record, so we only need to allocate for part id and a full record per
      partition.
      Otherwise we do a secondary sort by rowid (handler::ref) and must
      allocate for ref (includes part id) and full record per partition.
      We don't know yet if we need to do secondary sort by rowid, so we must
      allocate space for it.
    */
    if (m_curr_key_info[1])
      m_rec_offset= PARTITION_BYTES_IN_POS;
    else
      m_rec_offset= m_ref_length;
    /* One (header + record) slot per used partition. */
    alloc_len= used_parts * (m_rec_offset + m_rec_length);
    /* Allocate a key for temporary use when setting up the scan. */
    alloc_len+= table_share->max_key_length;

    if (!(m_ordered_rec_buffer= (uchar*)my_malloc(alloc_len, MYF(MY_WME))))
      DBUG_RETURN(true);

    /*
      We set-up one record per partition and each record has 2 bytes in
      front where the partition id is written. This is used by ordered
      index_read.
      If we need to also sort by rowid (handler::ref), then m_curr_key_info[1]
      is NULL and we add the rowid before the record.
      We also set-up a reference to the first record for temporary use in
      setting up the scan.
    */
    char *ptr= (char*) m_ordered_rec_buffer;
    uint i;
    for (i= bitmap_get_first_set(&m_part_info->read_partitions);
         i < m_tot_parts;
         i= bitmap_get_next_set(&m_part_info->read_partitions, i))
    {
      DBUG_PRINT("info", ("init rec-buf for part %u", i));
      /* Stamp the partition id into the slot header. */
      int2store(ptr, i);
      ptr+= m_rec_offset + m_rec_length;
    }
    /* ptr now points at the spare key area appended after the slots. */
    m_start_key.key= (const uchar*)ptr;
    /*
      Initialize priority queue, initialized to reading forward.
      Start by only sort by KEY, HA_EXTRA_SECONDARY_SORT_ROWID
      will be given if we should sort by handler::ref too.
    */
    if (init_queue(&m_queue, used_parts, m_rec_offset,
                   0,
                   key_rec_cmp,
                   (void*)m_curr_key_info))
    {
      /* Queue setup failed: release the buffer so we stay consistent. */
      my_free(m_ordered_rec_buffer);
      m_ordered_rec_buffer= NULL;
      DBUG_RETURN(true);
    }
  }
  DBUG_RETURN(false);
}
5104 
5105 
5106 /**
5107   Destroy the ordered record buffer and the priority queue.
5108 */
5109 
destroy_record_priority_queue()5110 void ha_partition::destroy_record_priority_queue()
5111 {
5112   DBUG_ENTER("ha_partition::destroy_record_priority_queue");
5113   if (m_ordered_rec_buffer)
5114   {
5115     delete_queue(&m_queue);
5116     my_free(m_ordered_rec_buffer);
5117     m_ordered_rec_buffer= NULL;
5118   }
5119   DBUG_VOID_RETURN;
5120 }
5121 
5122 
5123 /*
5124   Initialize handler before start of index scan
5125 
5126   SYNOPSIS
5127     index_init()
5128     inx                Index number
5129     sorted             Is rows to be returned in sorted order
5130 
5131   RETURN VALUE
5132     >0                 Error code
5133     0                  Success
5134 
5135   DESCRIPTION
5136     index_init is always called before starting index scans (except when
5137     starting through index_read_idx and using read_range variants).
5138 */
5139 
int ha_partition::index_init(uint inx, bool sorted)
{
  int error= 0;
  /* i tracks how far the init loop got; the err path ends only [first, i). */
  uint i;
  DBUG_ENTER("ha_partition::index_init");

  DBUG_PRINT("info", ("inx %u sorted %u", inx, sorted));
  active_index= inx;
  m_part_spec.start_part= NO_CURRENT_PART_ID;
  m_start_key.length= 0;
  m_ordered= sorted;
  m_sec_sort_by_rowid= false;
  m_curr_key_info[0]= table->key_info+inx;
  /* m_curr_key_info is a NULL-terminated array; [1] may be set below. */
  m_curr_key_info[1]= NULL;
  /*
    There are two cases where it is not enough to only sort on the key:
    1) For clustered indexes, the optimizer assumes that all keys
       have the rest of the PK columns appended to the KEY, so it will
       sort by PK as secondary sort key.
    2) Rowid-Order-Retrieval access methods, like index_merge_intersect
       and index_merge_union. These methods requires the index to be sorted
       on rowid (handler::ref) as secondary sort key.
  */
  if (m_pkey_is_clustered && table->s->primary_key != MAX_KEY)
  {
    /*
      if PK is clustered, then the key cmp must use the pk to
      differentiate between equal key in given index.
    */
    DBUG_PRINT("info", ("Clustered pk, using pk as secondary cmp"));
    m_curr_key_info[1]= table->key_info+table->s->primary_key;
    m_curr_key_info[2]= NULL;
  }

  if (init_record_priority_queue())
    DBUG_RETURN(HA_ERR_OUT_OF_MEM);

  /*
    Some handlers only read fields as specified by the bitmap for the
    read set. For partitioned handlers we always require that the
    fields of the partition functions are read such that we can
    calculate the partition id to place updated and deleted records.
    But this is required for operations that may need to change data only.
  */
  if (get_lock_type() == F_WRLCK)
    bitmap_union(table->read_set, &m_part_info->full_part_field_set);
  for (i= bitmap_get_first_set(&m_part_info->read_partitions);
       i < m_tot_parts;
       i= bitmap_get_next_set(&m_part_info->read_partitions, i))
  {
    if ((error= m_file[i]->ha_index_init(inx, sorted)))
      goto err;

    /* Test hook: simulate an init failure on the next partition. */
    DBUG_EXECUTE_IF("ha_partition_fail_index_init", {
      i++;
      error= HA_ERR_NO_PARTITION_FOUND;
      goto err;
    });
  }
/* Reached both by goto (error != 0) and by normal fallthrough (error == 0);
   the if below distinguishes the two cases. */
err:
  if (error)
  {
    /* End the previously initialized indexes. */
    uint j;
    for (j= bitmap_get_first_set(&m_part_info->read_partitions);
         j < i;
         j= bitmap_get_next_set(&m_part_info->read_partitions, j))
    {
      (void) m_file[j]->ha_index_end();
    }
    destroy_record_priority_queue();
  }
  DBUG_RETURN(error);
}
5214 
5215 
5216 /*
5217   End of index scan
5218 
5219   SYNOPSIS
5220     index_end()
5221 
5222   RETURN VALUE
5223     >0                 Error code
5224     0                  Success
5225 
5226   DESCRIPTION
5227     index_end is called at the end of an index scan to clean up any
5228     things needed to clean up.
5229 */
5230 
index_end()5231 int ha_partition::index_end()
5232 {
5233   int error= 0;
5234   uint i;
5235   DBUG_ENTER("ha_partition::index_end");
5236 
5237   active_index= MAX_KEY;
5238   m_part_spec.start_part= NO_CURRENT_PART_ID;
5239   m_sec_sort_by_rowid= false;
5240   for (i= bitmap_get_first_set(&m_part_info->read_partitions);
5241        i < m_tot_parts;
5242        i= bitmap_get_next_set(&m_part_info->read_partitions, i))
5243   {
5244     int tmp;
5245     if ((tmp= m_file[i]->ha_index_end()))
5246       error= tmp;
5247   }
5248   destroy_record_priority_queue();
5249   DBUG_RETURN(error);
5250 }
5251 
5252 
5253 /*
5254   Read one record in an index scan and start an index scan
5255 
5256   SYNOPSIS
5257     index_read_map()
5258     buf                    Read row in MySQL Row Format
5259     key                    Key parts in consecutive order
5260     keypart_map            Which part of key is used
5261     find_flag              What type of key condition is used
5262 
5263   RETURN VALUE
5264     >0                 Error code
5265     0                  Success
5266 
5267   DESCRIPTION
5268     index_read_map starts a new index scan using a start key. The MySQL Server
5269     will check the end key on its own. Thus to function properly the
5270     partitioned handler need to ensure that it delivers records in the sort
5271     order of the MySQL Server.
5272     index_read_map can be restarted without calling index_end on the previous
5273     index scan and without calling index_init. In this case the index_read_map
5274     is on the same index as the previous index_scan. This is particularly
    used in conjunction with multi read ranges.
5276 */
5277 
index_read_map(uchar * buf,const uchar * key,key_part_map keypart_map,enum ha_rkey_function find_flag)5278 int ha_partition::index_read_map(uchar *buf, const uchar *key,
5279                                  key_part_map keypart_map,
5280                                  enum ha_rkey_function find_flag)
5281 {
5282   DBUG_ENTER("ha_partition::index_read_map");
5283   end_range= 0;
5284   m_index_scan_type= partition_index_read;
5285   m_start_key.key= key;
5286   m_start_key.keypart_map= keypart_map;
5287   m_start_key.flag= find_flag;
5288   DBUG_RETURN(common_index_read(buf, TRUE));
5289 }
5290 
5291 
5292 /**
5293   Common routine for a number of index_read variants
5294 
5295   @param buf             Buffer where the record should be returned.
5296   @param have_start_key  TRUE <=> the left endpoint is available, i.e.
5297                          we're in index_read call or in read_range_first
5298                          call and the range has left endpoint.
5299                          FALSE <=> there is no left endpoint (we're in
5300                          read_range_first() call and the range has no left
5301                          endpoint).
5302 
5303   @return Operation status
5304     @retval 0      OK
5305     @retval HA_ERR_END_OF_FILE   Whole index scanned, without finding the record.
5306     @retval HA_ERR_KEY_NOT_FOUND Record not found, but index cursor positioned.
5307     @retval other  error code.
5308 
5309   @details
5310     Start scanning the range (when invoked from read_range_first()) or doing
5311     an index lookup (when invoked from index_read_XXX):
5312      - If possible, perform partition selection
5313      - Find the set of partitions we're going to use
5314      - Depending on whether we need ordering:
5315         NO:  Get the first record from first used partition (see
5316              handle_unordered_scan_next_partition)
5317         YES: Fill the priority queue and get the record that is the first in
5318              the ordering
5319 */
5320 
int ha_partition::common_index_read(uchar *buf, bool have_start_key)
{
  int error;
  uint UNINIT_VAR(key_len); /* used if have_start_key==TRUE */
  bool reverse_order= FALSE;
  DBUG_ENTER("ha_partition::common_index_read");

  DBUG_PRINT("info", ("m_ordered %u m_ordered_scan_ong %u",
                      m_ordered, m_ordered_scan_ongoing));

  if (have_start_key)
  {
    /* Compute the used key length from the keypart map. */
    m_start_key.length= key_len= calculate_key_len(table, active_index,
                                                   m_start_key.key,
                                                   m_start_key.keypart_map);
    DBUG_PRINT("info", ("have_start_key map %lu find_flag %u len %u",
                        m_start_key.keypart_map, m_start_key.flag, key_len));
    DBUG_ASSERT(key_len);
  }
  /* Prune partitions and decide ordered vs. unordered scan
     (sets m_ordered_scan_ongoing among other state). */
  if ((error= partition_scan_set_up(buf, have_start_key)))
  {
    DBUG_RETURN(error);
  }

  /* Backward-reading search flags force an ordered (reverse) scan. */
  if (have_start_key &&
      (m_start_key.flag == HA_READ_PREFIX_LAST ||
       m_start_key.flag == HA_READ_PREFIX_LAST_OR_PREV ||
       m_start_key.flag == HA_READ_BEFORE_KEY))
  {
    reverse_order= TRUE;
    m_ordered_scan_ongoing= TRUE;
  }
  DBUG_PRINT("info", ("m_ordered %u m_o_scan_ong %u have_start_key %u",
                      m_ordered, m_ordered_scan_ongoing, have_start_key));
  if (!m_ordered_scan_ongoing)
   {
    /*
      We use unordered index scan when read_range is used and flag
      is set to not use ordered.
      We also use an unordered index scan when the number of partitions to
      scan is only one.
      The unordered index scan will use the partition set created.
    */
    DBUG_PRINT("info", ("doing unordered scan"));
    error= handle_unordered_scan_next_partition(buf);
  }
  else
  {
    /*
      In all other cases we will use the ordered index scan. This will use
      the partition set created by the get_partition_set method.
    */
    error= handle_ordered_index_scan(buf, reverse_order);
  }
  DBUG_RETURN(error);
}
5377 
5378 
5379 /*
5380   Start an index scan from leftmost record and return first record
5381 
5382   SYNOPSIS
5383     index_first()
5384     buf                 Read row in MySQL Row Format
5385 
5386   RETURN VALUE
5387     >0                  Error code
5388     0                   Success
5389 
5390   DESCRIPTION
5391     index_first() asks for the first key in the index.
5392     This is similar to index_read except that there is no start key since
5393     the scan starts from the leftmost entry and proceeds forward with
5394     index_next.
5395 
5396     Called from opt_range.cc, opt_sum.cc, sql_handler.cc,
5397     and sql_select.cc.
5398 */
5399 
index_first(uchar * buf)5400 int ha_partition::index_first(uchar * buf)
5401 {
5402   DBUG_ENTER("ha_partition::index_first");
5403 
5404   end_range= 0;
5405   m_index_scan_type= partition_index_first;
5406   DBUG_RETURN(common_first_last(buf));
5407 }
5408 
5409 
5410 /*
5411   Start an index scan from rightmost record and return first record
5412 
5413   SYNOPSIS
5414     index_last()
5415     buf                 Read row in MySQL Row Format
5416 
5417   RETURN VALUE
5418     >0                  Error code
5419     0                   Success
5420 
5421   DESCRIPTION
5422     index_last() asks for the last key in the index.
5423     This is similar to index_read except that there is no start key since
5424     the scan starts from the rightmost entry and proceeds forward with
5425     index_prev.
5426 
5427     Called from opt_range.cc, opt_sum.cc, sql_handler.cc,
5428     and sql_select.cc.
5429 */
5430 
index_last(uchar * buf)5431 int ha_partition::index_last(uchar * buf)
5432 {
5433   DBUG_ENTER("ha_partition::index_last");
5434   int error = HA_ERR_END_OF_FILE;
5435   uint part_id = bitmap_get_first_set(&(m_part_info->read_partitions));
5436   if (part_id == MY_BIT_NONE)
5437   {
5438 	/* No partition to scan. */
5439 	DBUG_RETURN(error);
5440   }
5441   m_index_scan_type= partition_index_last;
5442   DBUG_RETURN(common_first_last(buf));
5443 
5444 }
5445 
5446 /*
5447   Common routine for index_first/index_last
5448 
5449   SYNOPSIS
5450     ha_partition::common_first_last()
5451 
5452   see index_first for rest
5453 */
5454 
common_first_last(uchar * buf)5455 int ha_partition::common_first_last(uchar *buf)
5456 {
5457   int error;
5458 
5459   if ((error= partition_scan_set_up(buf, FALSE)))
5460     return error;
5461   if (!m_ordered_scan_ongoing &&
5462       m_index_scan_type != partition_index_last)
5463     return handle_unordered_scan_next_partition(buf);
5464   return handle_ordered_index_scan(buf, FALSE);
5465 }
5466 
5467 
5468 /*
5469   Read last using key
5470 
5471   SYNOPSIS
5472     index_read_last_map()
5473     buf                   Read row in MySQL Row Format
5474     key                   Key
5475     keypart_map           Which part of key is used
5476 
5477   RETURN VALUE
5478     >0                    Error code
5479     0                     Success
5480 
5481   DESCRIPTION
5482     This is used in join_read_last_key to optimise away an ORDER BY.
5483     Can only be used on indexes supporting HA_READ_ORDER
5484 */
5485 
index_read_last_map(uchar * buf,const uchar * key,key_part_map keypart_map)5486 int ha_partition::index_read_last_map(uchar *buf, const uchar *key,
5487                                       key_part_map keypart_map)
5488 {
5489   DBUG_ENTER("ha_partition::index_read_last_map");
5490 
5491   m_ordered= TRUE;				// Safety measure
5492   end_range= 0;
5493   m_index_scan_type= partition_index_read_last;
5494   m_start_key.key= key;
5495   m_start_key.keypart_map= keypart_map;
5496   m_start_key.flag= HA_READ_PREFIX_LAST;
5497   DBUG_RETURN(common_index_read(buf, TRUE));
5498 }
5499 
5500 
5501 /*
5502   Optimization of the default implementation to take advantage of dynamic
5503   partition pruning.
5504 */
index_read_idx_map(uchar * buf,uint index,const uchar * key,key_part_map keypart_map,enum ha_rkey_function find_flag)5505 int ha_partition::index_read_idx_map(uchar *buf, uint index,
5506                                      const uchar *key,
5507                                      key_part_map keypart_map,
5508                                      enum ha_rkey_function find_flag)
5509 {
5510   int error= HA_ERR_KEY_NOT_FOUND;
5511   DBUG_ENTER("ha_partition::index_read_idx_map");
5512 
5513   if (find_flag == HA_READ_KEY_EXACT)
5514   {
5515     uint part;
5516     m_start_key.key= key;
5517     m_start_key.keypart_map= keypart_map;
5518     m_start_key.flag= find_flag;
5519     m_start_key.length= calculate_key_len(table, index, m_start_key.key,
5520                                           m_start_key.keypart_map);
5521 
5522     get_partition_set(table, buf, index, &m_start_key, &m_part_spec);
5523 
5524     /*
5525       We have either found exactly 1 partition
5526       (in which case start_part == end_part)
5527       or no matching partitions (start_part > end_part)
5528     */
5529     DBUG_ASSERT(m_part_spec.start_part >= m_part_spec.end_part);
5530     /* The start part is must be marked as used. */
5531     DBUG_ASSERT(m_part_spec.start_part > m_part_spec.end_part ||
5532                 bitmap_is_set(&(m_part_info->read_partitions),
5533                               m_part_spec.start_part));
5534 
5535     for (part= m_part_spec.start_part;
5536          part <= m_part_spec.end_part;
5537          part= bitmap_get_next_set(&m_part_info->read_partitions, part))
5538     {
5539       error= m_file[part]->ha_index_read_idx_map(buf, index, key,
5540                                                  keypart_map, find_flag);
5541       if (error != HA_ERR_KEY_NOT_FOUND &&
5542           error != HA_ERR_END_OF_FILE)
5543         break;
5544     }
5545     if (part <= m_part_spec.end_part)
5546       m_last_part= part;
5547   }
5548   else
5549   {
5550     /*
5551       If not only used with READ_EXACT, we should investigate if possible
5552       to optimize for other find_flag's as well.
5553     */
5554     DBUG_ASSERT(0);
5555     /* fall back on the default implementation */
5556     error= handler::index_read_idx_map(buf, index, key, keypart_map, find_flag);
5557   }
5558   DBUG_RETURN(error);
5559 }
5560 
5561 
5562 /*
5563   Read next record in a forward index scan
5564 
5565   SYNOPSIS
5566     index_next()
5567     buf                   Read row in MySQL Row Format
5568 
5569   RETURN VALUE
5570     >0                    Error code
5571     0                     Success
5572 
5573   DESCRIPTION
5574     Used to read forward through the index.
5575 */
5576 
index_next(uchar * buf)5577 int ha_partition::index_next(uchar * buf)
5578 {
5579   DBUG_ENTER("ha_partition::index_next");
5580 
5581   /*
5582     TODO(low priority):
5583     If we want partition to work with the HANDLER commands, we
5584     must be able to do index_last() -> index_prev() -> index_next()
5585     and if direction changes, we must step back those partitions in
5586     the record queue so we don't return a value from the wrong direction.
5587   */
5588   DBUG_ASSERT(m_index_scan_type != partition_index_last);
5589   if (!m_ordered_scan_ongoing)
5590   {
5591     DBUG_RETURN(handle_unordered_next(buf, FALSE));
5592   }
5593   DBUG_RETURN(handle_ordered_next(buf, FALSE));
5594 }
5595 
5596 
5597 /*
5598   Read next record special
5599 
5600   SYNOPSIS
5601     index_next_same()
5602     buf                   Read row in MySQL Row Format
5603     key                   Key
5604     keylen                Length of key
5605 
5606   RETURN VALUE
5607     >0                    Error code
5608     0                     Success
5609 
5610   DESCRIPTION
5611     This routine is used to read the next but only if the key is the same
5612     as supplied in the call.
5613 */
5614 
index_next_same(uchar * buf,const uchar * key,uint keylen)5615 int ha_partition::index_next_same(uchar *buf, const uchar *key, uint keylen)
5616 {
5617   DBUG_ENTER("ha_partition::index_next_same");
5618 
5619   DBUG_ASSERT(keylen == m_start_key.length);
5620   DBUG_ASSERT(m_index_scan_type != partition_index_last);
5621   if (!m_ordered_scan_ongoing)
5622     DBUG_RETURN(handle_unordered_next(buf, TRUE));
5623   DBUG_RETURN(handle_ordered_next(buf, TRUE));
5624 }
5625 
5626 
5627 /*
5628   Read next record when performing index scan backwards
5629 
5630   SYNOPSIS
5631     index_prev()
5632     buf                   Read row in MySQL Row Format
5633 
5634   RETURN VALUE
5635     >0                    Error code
5636     0                     Success
5637 
5638   DESCRIPTION
5639     Used to read backwards through the index.
5640 */
5641 
index_prev(uchar * buf)5642 int ha_partition::index_prev(uchar * buf)
5643 {
5644   DBUG_ENTER("ha_partition::index_prev");
5645 
5646   /* TODO: read comment in index_next */
5647   DBUG_ASSERT(m_index_scan_type != partition_index_first);
5648   DBUG_RETURN(handle_ordered_prev(buf));
5649 }
5650 
5651 
5652 /*
5653   Start a read of one range with start and end key
5654 
5655   SYNOPSIS
5656     read_range_first()
5657     start_key           Specification of start key
5658     end_key             Specification of end key
5659     eq_range_arg        Is it equal range
5660     sorted              Should records be returned in sorted order
5661 
5662   RETURN VALUE
5663     >0                    Error code
5664     0                     Success
5665 
5666   DESCRIPTION
5667     We reimplement read_range_first since we don't want the compare_key
5668     check at the end. This is already performed in the partition handler.
5669     read_range_next is very much different due to that we need to scan
5670     all underlying handlers.
5671 */
5672 
read_range_first(const key_range * start_key,const key_range * end_key,bool eq_range_arg,bool sorted)5673 int ha_partition::read_range_first(const key_range *start_key,
5674 				   const key_range *end_key,
5675 				   bool eq_range_arg, bool sorted)
5676 {
5677   int error;
5678   DBUG_ENTER("ha_partition::read_range_first");
5679 
5680   m_ordered= sorted;
5681   eq_range= eq_range_arg;
5682   set_end_range(end_key, RANGE_SCAN_ASC);
5683 
5684   range_key_part= m_curr_key_info[0]->key_part;
5685   if (start_key)
5686     m_start_key= *start_key;
5687   else
5688     m_start_key.key= NULL;
5689 
5690   m_index_scan_type= partition_read_range;
5691   error= common_index_read(m_rec0, MY_TEST(start_key));
5692   DBUG_RETURN(error);
5693 }
5694 
5695 
5696 /*
5697   Read next record in read of a range with start and end key
5698 
5699   SYNOPSIS
5700     read_range_next()
5701 
5702   RETURN VALUE
5703     >0                    Error code
5704     0                     Success
5705 */
5706 
read_range_next()5707 int ha_partition::read_range_next()
5708 {
5709   DBUG_ENTER("ha_partition::read_range_next");
5710 
5711   if (m_ordered_scan_ongoing)
5712   {
5713     DBUG_RETURN(handle_ordered_next(table->record[0], eq_range));
5714   }
5715   DBUG_RETURN(handle_unordered_next(table->record[0], eq_range));
5716 }
5717 
5718 
5719 /*
5720   Common routine to set up index scans
5721 
5722   SYNOPSIS
5723     ha_partition::partition_scan_set_up()
5724       buf            Buffer to later return record in (this function
                     needs it to calculate partitioning function
5726                      values)
5727 
5728       idx_read_flag  TRUE <=> m_start_key has range start endpoint which
5729                      probably can be used to determine the set of partitions
5730                      to scan.
5731                      FALSE <=> there is no start endpoint.
5732 
5733   DESCRIPTION
5734     Find out which partitions we'll need to read when scanning the specified
5735     range.
5736 
5737     If we need to scan only one partition, set m_ordered_scan_ongoing=FALSE
5738     as we will not need to do merge ordering.
5739 
5740   RETURN VALUE
5741     >0                    Error code
5742     0                     Success
5743 */
5744 
partition_scan_set_up(uchar * buf,bool idx_read_flag)5745 int ha_partition::partition_scan_set_up(uchar * buf, bool idx_read_flag)
5746 {
5747   DBUG_ENTER("ha_partition::partition_scan_set_up");
5748 
5749   if (idx_read_flag)
5750     get_partition_set(table,buf,active_index,&m_start_key,&m_part_spec);
5751   else
5752   {
5753     m_part_spec.start_part= 0;
5754     m_part_spec.end_part= m_tot_parts - 1;
5755   }
5756   if (m_part_spec.start_part > m_part_spec.end_part)
5757   {
5758     /*
5759       We discovered a partition set but the set was empty so we report
5760       key not found.
5761     */
5762     DBUG_PRINT("info", ("scan with no partition to scan"));
5763     table->status= STATUS_NOT_FOUND;
5764     DBUG_RETURN(HA_ERR_END_OF_FILE);
5765   }
5766   if (m_part_spec.start_part == m_part_spec.end_part)
5767   {
5768     /*
5769       We discovered a single partition to scan, this never needs to be
5770       performed using the ordered index scan.
5771     */
5772     DBUG_PRINT("info", ("index scan using the single partition %d",
5773 			m_part_spec.start_part));
5774     m_ordered_scan_ongoing= FALSE;
5775   }
5776   else
5777   {
5778     /*
5779       Set m_ordered_scan_ongoing according how the scan should be done
5780       Only exact partitions are discovered atm by get_partition_set.
5781       Verify this, also bitmap must have at least one bit set otherwise
5782       the result from this table is the empty set.
5783     */
5784     uint start_part= bitmap_get_first_set(&(m_part_info->read_partitions));
5785     if (start_part == MY_BIT_NONE)
5786     {
5787       DBUG_PRINT("info", ("scan with no partition to scan"));
5788       table->status= STATUS_NOT_FOUND;
5789       DBUG_RETURN(HA_ERR_END_OF_FILE);
5790     }
5791     if (start_part > m_part_spec.start_part)
5792       m_part_spec.start_part= start_part;
5793     DBUG_ASSERT(m_part_spec.start_part < m_tot_parts);
5794     m_ordered_scan_ongoing= m_ordered;
5795   }
5796   DBUG_ASSERT(m_part_spec.start_part < m_tot_parts &&
5797               m_part_spec.end_part < m_tot_parts);
5798   DBUG_RETURN(0);
5799 }
5800 
5801 
5802 /****************************************************************************
5803   Unordered Index Scan Routines
5804 ****************************************************************************/
5805 /*
5806   Common routine to handle index_next with unordered results
5807 
5808   SYNOPSIS
5809     handle_unordered_next()
5810     out:buf                       Read row in MySQL Row Format
5811     next_same                     Called from index_next_same
5812 
5813   RETURN VALUE
5814     HA_ERR_END_OF_FILE            End of scan
5815     0                             Success
5816     other                         Error code
5817 
5818   DESCRIPTION
5819     These routines are used to scan partitions without considering order.
5820     This is performed in two situations.
5821     1) In read_multi_range this is the normal case
5822     2) When performing any type of index_read, index_first, index_last where
    all fields in the partition function are bound. In this case the index
5824     scan is performed on only one partition and thus it isn't necessary to
5825     perform any sort.
5826 */
5827 
handle_unordered_next(uchar * buf,bool is_next_same)5828 int ha_partition::handle_unordered_next(uchar *buf, bool is_next_same)
5829 {
5830   handler *file;
5831   int error;
5832   DBUG_ENTER("ha_partition::handle_unordered_next");
5833 
5834   if (m_part_spec.start_part >= m_tot_parts)
5835   {
5836     /* Should never happen! */
5837     DBUG_ASSERT(0);
5838     DBUG_RETURN(HA_ERR_END_OF_FILE);
5839   }
5840   file= m_file[m_part_spec.start_part];
5841 
5842   /*
5843     We should consider if this should be split into three functions as
5844     partition_read_range is_next_same are always local constants
5845   */
5846 
5847   if (m_index_scan_type == partition_read_range)
5848   {
5849     if (!(error= file->read_range_next()))
5850     {
5851       m_last_part= m_part_spec.start_part;
5852       DBUG_RETURN(0);
5853     }
5854   }
5855   else if (is_next_same)
5856   {
5857     if (!(error= file->ha_index_next_same(buf, m_start_key.key,
5858                                           m_start_key.length)))
5859     {
5860       m_last_part= m_part_spec.start_part;
5861       DBUG_RETURN(0);
5862     }
5863   }
5864   else
5865   {
5866     if (!(error= file->ha_index_next(buf)))
5867     {
5868       m_last_part= m_part_spec.start_part;
5869       DBUG_RETURN(0);                           // Row was in range
5870     }
5871   }
5872 
5873   if (error == HA_ERR_END_OF_FILE)
5874   {
5875     m_part_spec.start_part++;                    // Start using next part
5876     error= handle_unordered_scan_next_partition(buf);
5877   }
5878   DBUG_RETURN(error);
5879 }
5880 
5881 
5882 /*
5883   Handle index_next when changing to new partition
5884 
5885   SYNOPSIS
5886     handle_unordered_scan_next_partition()
5887     buf                       Read row in MySQL Row Format
5888 
5889   RETURN VALUE
5890     HA_ERR_END_OF_FILE            End of scan
5891     0                             Success
5892     other                         Error code
5893 
5894   DESCRIPTION
5895     This routine is used to start the index scan on the next partition.
5896     Both initial start and after completing scan on one partition.
5897 */
5898 
handle_unordered_scan_next_partition(uchar * buf)5899 int ha_partition::handle_unordered_scan_next_partition(uchar * buf)
5900 {
5901   uint i= m_part_spec.start_part;
5902   int saved_error= HA_ERR_END_OF_FILE;
5903   DBUG_ENTER("ha_partition::handle_unordered_scan_next_partition");
5904 
5905   if (i)
5906     i= bitmap_get_next_set(&m_part_info->read_partitions, i - 1);
5907   else
5908     i= bitmap_get_first_set(&m_part_info->read_partitions);
5909 
5910   for (;
5911        i <= m_part_spec.end_part;
5912        i= bitmap_get_next_set(&m_part_info->read_partitions, i))
5913   {
5914     int error;
5915     handler *file= m_file[i];
5916     m_part_spec.start_part= i;
5917     switch (m_index_scan_type) {
5918     case partition_read_range:
5919       DBUG_PRINT("info", ("read_range_first on partition %d", i));
5920       error= file->read_range_first(m_start_key.key? &m_start_key: NULL,
5921                                     end_range, eq_range, FALSE);
5922       break;
5923     case partition_index_read:
5924       DBUG_PRINT("info", ("index_read on partition %d", i));
5925       error= file->ha_index_read_map(buf, m_start_key.key,
5926                                      m_start_key.keypart_map,
5927                                      m_start_key.flag);
5928       break;
5929     case partition_index_first:
5930       DBUG_PRINT("info", ("index_first on partition %d", i));
5931       error= file->ha_index_first(buf);
5932       break;
5933     case partition_index_first_unordered:
5934       /*
5935         We perform a scan without sorting and this means that we
5936         should not use the index_first since not all handlers
5937         support it and it is also unnecessary to restrict sort
5938         order.
5939       */
5940       DBUG_PRINT("info", ("read_range_first on partition %d", i));
5941       table->record[0]= buf;
5942       error= file->read_range_first(0, end_range, eq_range, 0);
5943       table->record[0]= m_rec0;
5944       break;
5945     default:
5946       DBUG_ASSERT(FALSE);
5947       DBUG_RETURN(1);
5948     }
5949     if (!error)
5950     {
5951       m_last_part= i;
5952       DBUG_RETURN(0);
5953     }
5954     if ((error != HA_ERR_END_OF_FILE) && (error != HA_ERR_KEY_NOT_FOUND))
5955       DBUG_RETURN(error);
5956 
5957     /*
5958       If HA_ERR_KEY_NOT_FOUND, we must return that error instead of
5959       HA_ERR_END_OF_FILE, to be able to continue search.
5960     */
5961     if (saved_error != HA_ERR_KEY_NOT_FOUND)
5962       saved_error= error;
5963     DBUG_PRINT("info", ("END_OF_FILE/KEY_NOT_FOUND on partition %d", i));
5964   }
5965   if (saved_error == HA_ERR_END_OF_FILE)
5966     m_part_spec.start_part= NO_CURRENT_PART_ID;
5967   DBUG_RETURN(saved_error);
5968 }
5969 
5970 
5971 /**
5972   Common routine to start index scan with ordered results.
5973 
5974   @param[out] buf  Read row in MySQL Row Format
5975 
5976   @return Operation status
5977     @retval HA_ERR_END_OF_FILE  End of scan
    @retval HA_ERR_KEY_NOT_FOUND  End of scan
5979     @retval 0                   Success
5980     @retval other               Error code
5981 
5982   @details
5983     This part contains the logic to handle index scans that require ordered
5984     output. This includes all except those started by read_range_first with
5985     the flag ordered set to FALSE. Thus most direct index_read and all
5986     index_first and index_last.
5987 
5988     We implement ordering by keeping one record plus a key buffer for each
5989     partition. Every time a new entry is requested we will fetch a new
5990     entry from the partition that is currently not filled with an entry.
5991     Then the entry is put into its proper sort position.
5992 
5993     Returning a record is done by getting the top record, copying the
5994     record to the request buffer and setting the partition as empty on
5995     entries.
5996 */
5997 
int ha_partition::handle_ordered_index_scan(uchar *buf, bool reverse_order)
{
  uint i;
  uint j= 0;                       /* number of queue slots filled so far */
  bool found= FALSE;
  uchar *part_rec_buf_ptr= m_ordered_rec_buffer;
  int saved_error= HA_ERR_END_OF_FILE;
  DBUG_ENTER("ha_partition::handle_ordered_index_scan");

  /* Starting a new scan: forget partitions that missed the key last time. */
  if (m_key_not_found)
  {
    m_key_not_found= false;
    bitmap_clear_all(&m_key_not_found_partitions);
  }
  m_top_entry= NO_CURRENT_PART_ID;
  queue_remove_all(&m_queue);
  DBUG_ASSERT(bitmap_is_set(&m_part_info->read_partitions,
                            m_part_spec.start_part));

  /*
    Position part_rec_buf_ptr to point to the first used partition >=
    start_part. There may be partitions marked in read_partitions that
    come before start_part. Those partitions have allocated record
    buffers but are dynamically pruned, so those buffers must be skipped.
  */
  for (i= bitmap_get_first_set(&m_part_info->read_partitions);
       i < m_part_spec.start_part;
       i= bitmap_get_next_set(&m_part_info->read_partitions, i))
  {
    part_rec_buf_ptr+= m_rec_offset + m_rec_length;
  }
  DBUG_PRINT("info", ("m_part_spec.start_part %u first_used_part %u",
                      m_part_spec.start_part, i));
  for (/* continue from above */ ;
       i <= m_part_spec.end_part;
       i= bitmap_get_next_set(&m_part_info->read_partitions, i))
  {
    DBUG_PRINT("info", ("reading from part %u (scan_type: %u)",
                        i, m_index_scan_type));
    /* Each buffer slot starts with the 2-byte partition id. */
    DBUG_ASSERT(i == uint2korr(part_rec_buf_ptr));
    uchar *rec_buf_ptr= part_rec_buf_ptr + m_rec_offset;
    int error;
    handler *file= m_file[i];

    /* Issue the first read on this partition according to the scan type. */
    switch (m_index_scan_type) {
    case partition_index_read:
      error= file->ha_index_read_map(rec_buf_ptr,
                                     m_start_key.key,
                                     m_start_key.keypart_map,
                                     m_start_key.flag);
      break;
    case partition_index_first:
      error= file->ha_index_first(rec_buf_ptr);
      reverse_order= FALSE;
      break;
    case partition_index_last:
      error= file->ha_index_last(rec_buf_ptr);
      reverse_order= TRUE;
      break;
    case partition_index_read_last:
      error= file->ha_index_read_last_map(rec_buf_ptr,
                                          m_start_key.key,
                                          m_start_key.keypart_map);
      reverse_order= TRUE;
      break;
    case partition_read_range:
    {
      /*
        This can only read record to table->record[0], as it was set when
        the table was being opened. We have to memcpy data ourselves into
        this partition's queue buffer.
      */
      error= file->read_range_first(m_start_key.key? &m_start_key: NULL,
                                    end_range, eq_range, TRUE);
      memcpy(rec_buf_ptr, table->record[0], m_rec_length);
      reverse_order= FALSE;
      break;
    }
    default:
      DBUG_ASSERT(FALSE);
      DBUG_RETURN(HA_ERR_END_OF_FILE);
    }
    if (!error)
    {
      found= TRUE;
      if (m_sec_sort_by_rowid)
      {
        /* Store the row reference so the merge can break key ties by it. */
        file->position(rec_buf_ptr);
        memcpy(part_rec_buf_ptr + PARTITION_BYTES_IN_POS,
               file->ref, file->ref_length);
      }
      /*
        Initialize queue without order first, simply insert
      */
      queue_element(&m_queue, j++)= part_rec_buf_ptr;
    }
    else if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE)
    {
      DBUG_RETURN(error);
    }
    else if (error == HA_ERR_KEY_NOT_FOUND)
    {
      /* Remember this partition; it may still yield rows on index_next. */
      DBUG_PRINT("info", ("HA_ERR_KEY_NOT_FOUND from partition %u", i));
      bitmap_set_bit(&m_key_not_found_partitions, i);
      m_key_not_found= true;
      saved_error= error;
    }
    /* Advance to the next partition's slot in the ordered record buffer. */
    part_rec_buf_ptr+= m_rec_offset + m_rec_length;
  }
  if (found)
  {
    /*
      We found at least one partition with data, now sort all entries and
      after that read the first entry and copy it to the buffer to return in.
    */
    queue_set_max_at_top(&m_queue, reverse_order);
    queue_set_cmp_arg(&m_queue, (void*)m_curr_key_info);
    DBUG_ASSERT(m_queue.elements == 0);
    /*
      If PK, we should not sort by rowid, since that is already done
      through the KEY setup.
    */
    DBUG_ASSERT(!m_curr_key_info[1] || !m_sec_sort_by_rowid);
    /* Bulk-load the queue elements and heapify once via queue_fix. */
    m_queue.elements= j;
    queue_fix(&m_queue);
    return_top_record(buf);
    table->status= 0;
    DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry));
    DBUG_RETURN(0);
  }
  DBUG_RETURN(saved_error);
}
6129 
6130 
6131 /*
6132   Return the top record in sort order
6133 
6134   SYNOPSIS
6135     return_top_record()
6136     out:buf                  Row returned in MySQL Row Format
6137 
6138   RETURN VALUE
6139     NONE
6140 */
6141 
return_top_record(uchar * buf)6142 void ha_partition::return_top_record(uchar *buf)
6143 {
6144   uint part_id;
6145   uchar *key_buffer= queue_top(&m_queue);
6146   uchar *rec_buffer= key_buffer + m_rec_offset;
6147 
6148   part_id= uint2korr(key_buffer);
6149   /* Do column copy to avoid overwriting of non read columns
6150   specific to table with innodb engine */
6151   handler *file= m_file[part_id];
6152   file->copy_cached_row(buf, rec_buffer, m_rec_length);
6153   m_last_part= part_id;
6154   m_top_entry= part_id;
6155 }
6156 
6157 
6158 /**
6159   Add index_next/prev from partitions without exact match.
6160 
6161   If there where any partitions that returned HA_ERR_KEY_NOT_FOUND when
6162   ha_index_read_map was done, those partitions must be included in the
6163   following index_next/prev call.
6164 */
6165 
int ha_partition::handle_ordered_index_scan_key_not_found()
{
  int error;
  uint i, old_elements= m_queue.elements;
  uchar *part_buf= m_ordered_rec_buffer;
  uchar *curr_rec_buf= NULL;
  DBUG_ENTER("ha_partition::handle_ordered_index_scan_key_not_found");
  DBUG_ASSERT(m_key_not_found);
  /*
    Loop over all used partitions to get the correct offset
    into m_ordered_rec_buffer.
  */
  for (i= bitmap_get_first_set(&m_part_info->read_partitions);
       i < m_tot_parts;
       i= bitmap_get_next_set(&m_part_info->read_partitions, i))
  {
    if (bitmap_is_set(&m_key_not_found_partitions, i))
    {
      /*
        This partition is used and did return HA_ERR_KEY_NOT_FOUND
        in index_read_map.
      */
      curr_rec_buf= part_buf + m_rec_offset;
      error= m_file[i]->ha_index_next(curr_rec_buf);
      /* HA_ERR_KEY_NOT_FOUND is not allowed from index_next! */
      DBUG_ASSERT(error != HA_ERR_KEY_NOT_FOUND);
      if (!error)
      {
        if (m_sec_sort_by_rowid)
        {
          /* Store the row reference used as a secondary sort criterion. */
          m_file[i]->position(curr_rec_buf);
          memcpy(part_buf + PARTITION_BYTES_IN_POS,
                 m_file[i]->ref,
                 m_file[i]->ref_length);
        }
        queue_insert(&m_queue, part_buf);
      }
      else if (error != HA_ERR_END_OF_FILE && error != HA_ERR_KEY_NOT_FOUND)
        DBUG_RETURN(error);
    }
    /* Advance to the next partition's slot in the ordered record buffer. */
    part_buf+= m_rec_offset + m_rec_length;
  }
  DBUG_ASSERT(curr_rec_buf);
  /* All pending partitions have now been retried; reset the state. */
  bitmap_clear_all(&m_key_not_found_partitions);
  m_key_not_found= false;

  if (m_queue.elements > old_elements)
  {
    /* Update m_top_entry, which may have changed. */
    uchar *key_buffer= queue_top(&m_queue);
    m_top_entry= uint2korr(key_buffer);
  }
  DBUG_RETURN(0);
}
6220 
6221 
6222 /*
6223   Common routine to handle index_next with ordered results
6224 
6225   SYNOPSIS
6226     handle_ordered_next()
6227     out:buf                       Read row in MySQL Row Format
6228     next_same                     Called from index_next_same
6229 
6230   RETURN VALUE
6231     HA_ERR_END_OF_FILE            End of scan
6232     0                             Success
6233     other                         Error code
6234 */
6235 
int ha_partition::handle_ordered_next(uchar *buf, bool is_next_same)
{
  int error;
  uint part_id= m_top_entry;
  /* Queue top is the buffer of the partition that delivered the last row. */
  uchar *rec_buf= queue_top(&m_queue) + m_rec_offset;
  handler *file;
  DBUG_ENTER("ha_partition::handle_ordered_next");

  if (m_key_not_found)
  {
    if (is_next_same)
    {
      /* Only rows which match the key. */
      m_key_not_found= false;
      bitmap_clear_all(&m_key_not_found_partitions);
    }
    else
    {
      /* There are partitions not included in the index record queue. */
      uint old_elements= m_queue.elements;
      if ((error= handle_ordered_index_scan_key_not_found()))
        DBUG_RETURN(error);
      /*
        If the queue top changed, i.e. one of the partitions that gave
        HA_ERR_KEY_NOT_FOUND in index_read_map found the next record,
        return it.
        Otherwise replace the old with a call to index_next (fall through).
      */
      if (old_elements != m_queue.elements && part_id != m_top_entry)
      {
        return_top_record(buf);
        DBUG_RETURN(0);
      }
    }
  }
  /* No current partition (e.g. NO_CURRENT_PART_ID): the scan is done. */
  if (part_id >= m_tot_parts)
    DBUG_RETURN(HA_ERR_END_OF_FILE);

  file= m_file[part_id];

  if (m_index_scan_type == partition_read_range)
  {
    /* read_range_next reads into table->record[0]; copy to queue buffer. */
    error= file->read_range_next();
    memcpy(rec_buf, table->record[0], m_rec_length);
  }
  else if (!is_next_same)
    error= file->ha_index_next(rec_buf);
  else
    error= file->ha_index_next_same(rec_buf, m_start_key.key,
                                    m_start_key.length);
  if (error)
  {
    if (error == HA_ERR_END_OF_FILE)
    {
      /* Return next buffered row */
      queue_remove(&m_queue, (uint) 0);
      if (m_queue.elements)
      {
         DBUG_PRINT("info", ("Record returned from partition %u (2)",
                     m_top_entry));
         return_top_record(buf);
         table->status= 0;
         error= 0;
      }
    }
    DBUG_RETURN(error);
  }
  if (m_sec_sort_by_rowid)
  {
    /* Refresh the stored row reference used as a secondary sort key. */
    file->position(rec_buf);
    memcpy(rec_buf - m_rec_offset + PARTITION_BYTES_IN_POS,
           file->ref, file->ref_length);
  }
  /* Re-establish heap order after replacing the top element's row. */
  queue_replaced(&m_queue);
  return_top_record(buf);
  DBUG_PRINT("info", ("Record returned from partition %u", m_top_entry));
  DBUG_RETURN(0);
}
6314 
6315 
6316 /*
6317   Common routine to handle index_prev with ordered results
6318 
6319   SYNOPSIS
6320     handle_ordered_prev()
6321     out:buf                       Read row in MySQL Row Format
6322 
6323   RETURN VALUE
6324     HA_ERR_END_OF_FILE            End of scan
6325     0                             Success
6326     other                         Error code
6327 */
6328 
handle_ordered_prev(uchar * buf)6329 int ha_partition::handle_ordered_prev(uchar *buf)
6330 {
6331   int error;
6332   uint part_id= m_top_entry;
6333   uchar *rec_buf= queue_top(&m_queue) + m_rec_offset;
6334   handler *file= m_file[part_id];
6335   DBUG_ENTER("ha_partition::handle_ordered_prev");
6336 
6337   if ((error= file->ha_index_prev(rec_buf)))
6338   {
6339     if (error == HA_ERR_END_OF_FILE)
6340     {
6341       queue_remove(&m_queue, (uint) 0);
6342       if (m_queue.elements)
6343       {
6344 	return_top_record(buf);
6345 	DBUG_PRINT("info", ("Record returned from partition %d (2)",
6346 			    m_top_entry));
6347         error= 0;
6348         table->status= 0;
6349       }
6350     }
6351     DBUG_RETURN(error);
6352   }
6353   if (m_sec_sort_by_rowid)
6354   {
6355     file->position(rec_buf);
6356     memcpy(rec_buf - m_rec_offset + PARTITION_BYTES_IN_POS,
6357            file->ref, file->ref_length);
6358   }
6359   queue_replaced(&m_queue);
6360   return_top_record(buf);
6361   DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry));
6362   DBUG_RETURN(0);
6363 }
6364 
6365 
6366 /****************************************************************************
6367                 MODULE information calls
6368 ****************************************************************************/
6369 
6370 /*
6371   These are all first approximations of the extra, info, scan_time
6372   and read_time calls
6373 */
6374 
6375 /**
6376   Helper function for sorting according to number of rows in descending order.
6377 */
6378 
compare_number_of_records(ha_partition * me,const uint32 * a,const uint32 * b)6379 int ha_partition::compare_number_of_records(ha_partition *me,
6380                                             const uint32 *a,
6381                                             const uint32 *b)
6382 {
6383   handler **file= me->m_file;
6384   /* Note: sorting in descending order! */
6385   if (file[*a]->stats.records > file[*b]->stats.records)
6386     return -1;
6387   if (file[*a]->stats.records < file[*b]->stats.records)
6388     return 1;
6389   return 0;
6390 }
6391 
6392 
6393 /*
6394   General method to gather info from handler
6395 
6396   SYNOPSIS
6397     info()
6398     flag              Specifies what info is requested
6399 
6400   RETURN VALUE
6401     NONE
6402 
6403   DESCRIPTION
6404     ::info() is used to return information to the optimizer.
6405     Currently this table handler doesn't implement most of the fields
6406     really needed. SHOW also makes use of this data
    Another note, if your handler doesn't provide an exact record count,
    you will probably want to have the following in your code:
6409     if (records < 2)
6410       records = 2;
6411     The reason is that the server will optimize for cases of only a single
6412     record. If in a table scan you don't know the number of records
6413     it will probably be better to set records to two so you can return
6414     as many records as you need.
6415 
6416     Along with records a few more variables you may wish to set are:
6417       records
6418       deleted
6419       data_file_length
6420       index_file_length
6421       delete_length
6422       check_time
6423     Take a look at the public variables in handler.h for more information.
6424 
6425     Called in:
6426       filesort.cc
6427       ha_heap.cc
6428       item_sum.cc
6429       opt_sum.cc
6430       sql_delete.cc
      sql_delete.cc
      sql_derived.cc
6433       sql_select.cc
6434       sql_select.cc
6435       sql_select.cc
6436       sql_select.cc
6437       sql_select.cc
6438       sql_show.cc
6439       sql_show.cc
6440       sql_show.cc
6441       sql_show.cc
6442       sql_table.cc
6443       sql_union.cc
6444       sql_update.cc
6445 
6446     Some flags that are not implemented
6447       HA_STATUS_POS:
6448         This parameter is never used from the MySQL Server. It is checked in a
6449         place in MyISAM so could potentially be used by MyISAM specific
6450         programs.
6451       HA_STATUS_NO_LOCK:
6452       This is declared and often used. It's only used by MyISAM.
6453       It means that MySQL doesn't need the absolute latest statistics
6454       information. This may save the handler from doing internal locks while
6455       retrieving statistics data.
6456 */
6457 
int ha_partition::info(uint flag)
{
  uint no_lock_flag= flag & HA_STATUS_NO_LOCK;
  uint extra_var_flag= flag & HA_STATUS_VARIABLE_EXTRA;
  DBUG_ENTER("ha_partition::info");

#ifndef DBUG_OFF
  if (bitmap_is_set_all(&(m_part_info->read_partitions)))
    DBUG_PRINT("info", ("All partitions are used"));
#endif /* DBUG_OFF */
  if (flag & HA_STATUS_AUTO)
  {
    bool auto_inc_is_first_in_idx= (table_share->next_number_keypart == 0);
    DBUG_PRINT("info", ("HA_STATUS_AUTO"));
    if (!table->found_next_number_field)
      stats.auto_increment_value= 0;
    else if (part_share->auto_inc_initialized)
    {
      /* Cached value is valid: read it under the auto-increment lock. */
      lock_auto_increment();
      stats.auto_increment_value= part_share->next_auto_inc_val;
      unlock_auto_increment();
    }
    else
    {
      lock_auto_increment();
      /* to avoid two concurrent initializations, check again when locked */
      if (part_share->auto_inc_initialized)
        stats.auto_increment_value= part_share->next_auto_inc_val;
      else
      {
        /*
          The auto-inc mutex in the table_share is locked, so we do not need
          to have the handlers locked.
          HA_STATUS_NO_LOCK is not checked, since we cannot skip locking
          the mutex, because it is initialized.
        */
        handler *file, **file_array;
        ulonglong auto_increment_value= 0;
        file_array= m_file;
        DBUG_PRINT("info",
                   ("checking all partitions for auto_increment_value"));
        /* The table's next value is the maximum over all partitions. */
        do
        {
          file= *file_array;
          file->info(HA_STATUS_AUTO | no_lock_flag);
          set_if_bigger(auto_increment_value,
                        file->stats.auto_increment_value);
        } while (*(++file_array));

        DBUG_ASSERT(auto_increment_value);
        stats.auto_increment_value= auto_increment_value;
        if (auto_inc_is_first_in_idx)
        {
          set_if_bigger(part_share->next_auto_inc_val,
                        auto_increment_value);
          part_share->auto_inc_initialized= true;
          DBUG_PRINT("info", ("initializing next_auto_inc_val to %lu",
                       (ulong) part_share->next_auto_inc_val));
        }
      }
      unlock_auto_increment();
    }
  }
  if (flag & HA_STATUS_VARIABLE)
  {
    uint i;
    DBUG_PRINT("info", ("HA_STATUS_VARIABLE"));
    /*
      Calculates statistical variables
      records:           Estimate of number records in table
      We report sum (always at least 2 if not empty)
      deleted:           Estimate of number holes in the table due to
      deletes
      We report sum
      data_file_length:  Length of data file, in principle bytes in table
      We report sum
      index_file_length: Length of index file, in principle bytes in
      indexes in the table
      We report sum
      delete_length: Length of free space easily used by new records in table
      We report sum
      mean_record_length:Mean record length in the table
      We calculate this
      check_time:        Time of last check (only applicable to MyISAM)
      We report last time of all underlying handlers
    */
    handler *file;
    stats.records= 0;
    stats.deleted= 0;
    stats.data_file_length= 0;
    stats.index_file_length= 0;
    stats.check_time= 0;
    stats.delete_length= 0;
    /* Aggregate over the partitions marked for reading only. */
    for (i= bitmap_get_first_set(&m_part_info->read_partitions);
         i < m_tot_parts;
         i= bitmap_get_next_set(&m_part_info->read_partitions, i))
    {
      file= m_file[i];
      file->info(HA_STATUS_VARIABLE | no_lock_flag | extra_var_flag);
      stats.records+= file->stats.records;
      stats.deleted+= file->stats.deleted;
      stats.data_file_length+= file->stats.data_file_length;
      stats.index_file_length+= file->stats.index_file_length;
      stats.delete_length+= file->stats.delete_length;
      if (file->stats.check_time > stats.check_time)
        stats.check_time= file->stats.check_time;
    }
    /*
      Report at least 2 rows for a non-empty table with inexact statistics;
      the optimizer special-cases tables with a single row.
    */
    if (stats.records && stats.records < 2 &&
        !(m_file[0]->ha_table_flags() & HA_STATS_RECORDS_IS_EXACT))
      stats.records= 2;
    if (stats.records > 0)
      stats.mean_rec_length= (ulong) (stats.data_file_length / stats.records);
    else
      stats.mean_rec_length= 0;
  }
  if (flag & HA_STATUS_CONST)
  {
    DBUG_PRINT("info", ("HA_STATUS_CONST"));
    /*
      Recalculate loads of constant variables. MyISAM also sets things
      directly on the table share object.

      Check whether this should be fixed since handlers should not
      change things directly on the table object.

      Monty comment: This should NOT be changed!  It's the handlers
      responsibility to correct table->s->keys_xxxx information if keys
      have been disabled.

      The most important parameters set here is records per key on
      all indexes. block_size and primary key ref_length.

      For each index there is an array of rec_per_key.
      As an example if we have an index with three attributes a,b and c
      we will have an array of 3 rec_per_key.
      rec_per_key[0] is an estimate of number of records divided by
      number of unique values of the field a.
      rec_per_key[1] is an estimate of the number of records divided
      by the number of unique combinations of the fields a and b.
      rec_per_key[2] is an estimate of the number of records divided
      by the number of unique combinations of the fields a,b and c.

      Many handlers only set the value of rec_per_key when all fields
      are bound (rec_per_key[2] in the example above).

      If the handler doesn't support statistics, it should set all of the
      above to 0.

      We first scan through all partitions to get the one holding most rows.
      We will then allow the handler with the most rows to set
      the rec_per_key and use this as an estimate on the total table.

      max_data_file_length:     Maximum data file length
      We ignore it, is only used in
      SHOW TABLE STATUS
      max_index_file_length:    Maximum index file length
      We ignore it since it is never used
      block_size:               Block size used
      We set it to the value of the first handler
      ref_length:               We set this to the value calculated
      and stored in local object
      create_time:              Creation time of table

      So we calculate these constants by using the variables from the
      handler with most rows.
    */
    handler *file, **file_array;
    ulonglong max_records= 0;
    uint32 i= 0;
    uint32 handler_instance= 0;

    file_array= m_file;
    do
    {
      file= *file_array;
      /* Get variables if not already done */
      if (!(flag & HA_STATUS_VARIABLE) ||
          !bitmap_is_set(&(m_part_info->read_partitions),
                         (file_array - m_file)))
        file->info(HA_STATUS_VARIABLE | no_lock_flag | extra_var_flag);
      if (file->stats.records > max_records)
      {
        max_records= file->stats.records;
        handler_instance= i;
      }
      i++;
    } while (*(++file_array));
    /*
      Sort the array of part_ids by number of records in
      in descending order.
    */
    my_qsort2((void*) m_part_ids_sorted_by_num_of_records,
              m_tot_parts,
              sizeof(uint32),
              (qsort2_cmp) compare_number_of_records,
              this);

    /* Use the partition with most rows as representative for constants. */
    file= m_file[handler_instance];
    file->info(HA_STATUS_CONST | no_lock_flag);
    stats.block_size= file->stats.block_size;
    stats.create_time= file->stats.create_time;
    ref_length= m_ref_length;
  }
  if (flag & HA_STATUS_ERRKEY)
  {
    handler *file= m_file[m_last_part];
    DBUG_PRINT("info", ("info: HA_STATUS_ERRKEY"));
    /*
      This flag is used to get index number of the unique index that
      reported duplicate key
      We will report the errkey on the last handler used and ignore the rest
      Note: not all engines support HA_STATUS_ERRKEY, so set errkey first.
    */
    file->errkey= errkey;
    file->info(HA_STATUS_ERRKEY | no_lock_flag);
    errkey= file->errkey;
  }
  if (flag & HA_STATUS_TIME)
  {
    handler *file, **file_array;
    DBUG_PRINT("info", ("info: HA_STATUS_TIME"));
    /*
      This flag is used to set the latest update time of the table.
      Used by SHOW commands
      We will report the maximum of these times
    */
    stats.update_time= 0;
    file_array= m_file;
    do
    {
      file= *file_array;
      file->info(HA_STATUS_TIME | no_lock_flag);
      if (file->stats.update_time > stats.update_time)
	stats.update_time= file->stats.update_time;
    } while (*(++file_array));
  }
  DBUG_RETURN(0);
}
6696 
6697 
get_dynamic_partition_info(PARTITION_STATS * stat_info,uint part_id)6698 void ha_partition::get_dynamic_partition_info(PARTITION_STATS *stat_info,
6699                                               uint part_id)
6700 {
6701   handler *file= m_file[part_id];
6702   DBUG_ASSERT(bitmap_is_set(&(m_part_info->read_partitions), part_id));
6703   file->info(HA_STATUS_TIME | HA_STATUS_VARIABLE |
6704              HA_STATUS_VARIABLE_EXTRA | HA_STATUS_NO_LOCK);
6705 
6706   stat_info->records=              file->stats.records;
6707   stat_info->mean_rec_length=      file->stats.mean_rec_length;
6708   stat_info->data_file_length=     file->stats.data_file_length;
6709   stat_info->max_data_file_length= file->stats.max_data_file_length;
6710   stat_info->index_file_length=    file->stats.index_file_length;
6711   stat_info->delete_length=        file->stats.delete_length;
6712   stat_info->create_time=          file->stats.create_time;
6713   stat_info->update_time=          file->stats.update_time;
6714   stat_info->check_time=           file->stats.check_time;
6715   stat_info->check_sum= 0;
6716   if (file->ha_table_flags() & HA_HAS_CHECKSUM)
6717     stat_info->check_sum= file->checksum();
6718   return;
6719 }
6720 
6721 
6722 /**
6723   General function to prepare handler for certain behavior.
6724 
6725   @param[in]    operation       operation to execute
6726 
6727   @return       status
6728     @retval     0               success
6729     @retval     >0              error code
6730 
6731   @detail
6732 
6733   extra() is called whenever the server wishes to send a hint to
6734   the storage engine. The MyISAM engine implements the most hints.
6735 
6736   We divide the parameters into the following categories:
6737   1) Operations used by most handlers
6738   2) Operations used by some non-MyISAM handlers
6739   3) Operations used only by MyISAM
6740   4) Operations only used by temporary tables for query processing
6741   5) Operations only used by MyISAM internally
6742   6) Operations not used at all
6743   7) Operations only used by federated tables for query processing
6744   8) Operations only used by NDB
6745   9) Operations only used by MERGE
6746   10) Operations only used by InnoDB
6747   11) Operations only used by partitioning
6748 
6749   The partition handler need to handle category 1), 2), 3), 10) and 11).
6750 
6751   1) Operations used by most handlers
6752   -----------------------------------
6753   HA_EXTRA_RESET:
6754     This option is used by most handlers and it resets the handler state
6755     to the same state as after an open call. This includes releasing
6756     any READ CACHE or WRITE CACHE or other internal buffer used.
6757 
6758     It is called from the reset method in the handler interface. There are
6759     three instances where this is called.
6760     1) After completing a INSERT ... SELECT ... query the handler for the
6761        table inserted into is reset
6762     2) It is called from close_thread_table which in turn is called from
6763        close_thread_tables except in the case where the tables are locked
6764        in which case ha_commit_stmt is called instead.
6765        It is only called from here if refresh_version hasn't changed and the
6766        table is not an old table when calling close_thread_table.
6767        close_thread_tables is called from many places as a general clean up
6768        function after completing a query.
6769     3) It is called when deleting the QUICK_RANGE_SELECT object if the
6770        QUICK_RANGE_SELECT object had its own handler object. It is called
       immediately before close of this local handler object.
6772   HA_EXTRA_KEYREAD:
6773   HA_EXTRA_NO_KEYREAD:
6774     These parameters are used to provide an optimisation hint to the handler.
6775     If HA_EXTRA_KEYREAD is set it is enough to read the index fields, for
6776     many handlers this means that the index-only scans can be used and it
6777     is not necessary to use the real records to satisfy this part of the
6778     query. Index-only scans is a very important optimisation for disk-based
6779     indexes. For main-memory indexes most indexes contain a reference to the
6780     record and thus KEYREAD only says that it is enough to read key fields.
6781     HA_EXTRA_NO_KEYREAD disables this for the handler, also HA_EXTRA_RESET
6782     will disable this option.
6783     The handler will set HA_KEYREAD_ONLY in its table flags to indicate this
6784     feature is supported.
6785   HA_EXTRA_FLUSH:
6786     Indication to flush tables to disk, is supposed to be used to
6787     ensure disk based tables are flushed at end of query execution.
6788     Currently is never used.
6789   HA_EXTRA_PREPARE_FOR_RENAME:
6790     Informs the handler we are about to attempt a rename of the table.
6791     For handlers that have share open files (MyISAM key-file and
6792     Archive writer) they must close the files before rename is possible
6793     on Windows.
6794   HA_EXTRA_FORCE_REOPEN:
6795     Only used by MyISAM and Archive, called when altering table,
6796     closing tables to enforce a reopen of the table files.
6797 
6798   2) Operations used by some non-MyISAM handlers
6799   ----------------------------------------------
6800   HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
6801     This is a strictly InnoDB feature that is more or less undocumented.
6802     When it is activated InnoDB copies field by field from its fetch
6803     cache instead of all fields in one memcpy. Have no idea what the
6804     purpose of this is.
6805     Cut from include/my_base.h:
6806     When using HA_EXTRA_KEYREAD, overwrite only key member fields and keep
6807     other fields intact. When this is off (by default) InnoDB will use memcpy
6808     to overwrite entire row.
6809   HA_EXTRA_IGNORE_DUP_KEY:
6810   HA_EXTRA_NO_IGNORE_DUP_KEY:
    Informs the handler that we will not stop the transaction if we get
    duplicate key errors during insert/update.
6813     Always called in pair, triggered by INSERT IGNORE and other similar
6814     SQL constructs.
6815     Not used by MyISAM.
6816 
6817   3) Operations used only by MyISAM
6818   ---------------------------------
6819   HA_EXTRA_NORMAL:
6820     Only used in MyISAM to reset quick mode, not implemented by any other
6821     handler. Quick mode is also reset in MyISAM by HA_EXTRA_RESET.
6822 
6823     It is called after completing a successful DELETE query if the QUICK
6824     option is set.
6825 
6826   HA_EXTRA_QUICK:
6827     When the user does DELETE QUICK FROM table where-clause; this extra
6828     option is called before the delete query is performed and
6829     HA_EXTRA_NORMAL is called after the delete query is completed.
6830     Temporary tables used internally in MySQL always set this option
6831 
6832     The meaning of quick mode is that when deleting in a B-tree no merging
6833     of leafs is performed. This is a common method and many large DBMS's
6834     actually only support this quick mode since it is very difficult to
6835     merge leaves in a tree used by many threads concurrently.
6836 
6837   HA_EXTRA_CACHE:
6838     This flag is usually set with extra_opt along with a cache size.
6839     The size of this buffer is set by the user variable
6840     record_buffer_size. The value of this cache size is the amount of
6841     data read from disk in each fetch when performing a table scan.
6842     This means that before scanning a table it is normal to call
6843     extra with HA_EXTRA_CACHE and when the scan is completed to call
6844     HA_EXTRA_NO_CACHE to release the cache memory.
6845 
6846     Some special care is taken when using this extra parameter since there
6847     could be a write ongoing on the table in the same statement. In this
6848     one has to take special care since there might be a WRITE CACHE as
6849     well. HA_EXTRA_CACHE specifies using a READ CACHE and using
6850     READ CACHE and WRITE CACHE at the same time is not possible.
6851 
6852     Only MyISAM currently use this option.
6853 
6854     It is set when doing full table scans using rr_sequential and
6855     reset when completing such a scan with end_read_record
6856     (resetting means calling extra with HA_EXTRA_NO_CACHE).
6857 
6858     It is set in filesort.cc for MyISAM internal tables and it is set in
6859     a multi-update where HA_EXTRA_CACHE is called on a temporary result
6860     table and after that ha_rnd_init(0) on table to be updated
6861     and immediately after that HA_EXTRA_NO_CACHE on table to be updated.
6862 
6863     Apart from that it is always used from init_read_record but not when
6864     used from UPDATE statements. It is not used from DELETE statements
6865     with ORDER BY and LIMIT but it is used in normal scan loop in DELETE
6866     statements. The reason here is that DELETE's in MyISAM doesn't move
    existing data rows.
6868 
6869     It is also set in copy_data_between_tables when scanning the old table
6870     to copy over to the new table.
6871     And it is set in join_init_read_record where quick objects are used
6872     to perform a scan on the table. In this case the full table scan can
6873     even be performed multiple times as part of the nested loop join.
6874 
6875     For purposes of the partition handler it is obviously necessary to have
6876     special treatment of this extra call. If we would simply pass this
6877     extra call down to each handler we would allocate
6878     cache size * no of partitions amount of memory and this is not
6879     necessary since we will only scan one partition at a time when doing
6880     full table scans.
6881 
6882     Thus we treat it by first checking whether we have MyISAM handlers in
6883     the table, if not we simply ignore the call and if we have we will
6884     record the call but will not call any underlying handler yet. Then
6885     when performing the sequential scan we will check this recorded value
6886     and call extra_opt whenever we start scanning a new partition.
6887 
6888   HA_EXTRA_NO_CACHE:
6889     When performing a UNION SELECT HA_EXTRA_NO_CACHE is called from the
6890     flush method in the select_union class.
6891     It is used to some extent when insert delayed inserts.
6892     See HA_EXTRA_RESET_STATE for use in conjunction with delete_all_rows().
6893 
6894     It should be ok to call HA_EXTRA_NO_CACHE on all underlying handlers
6895     if they are MyISAM handlers. Other handlers we can ignore the call
6896     for. If no cache is in use they will quickly return after finding
6897     this out. And we also ensure that all caches are disabled and no one
6898     is left by mistake.
6899     In the future this call will probably be deleted and we will instead call
6900     ::reset();
6901 
6902   HA_EXTRA_WRITE_CACHE:
6903     See above, called from various places. It is mostly used when we
6904     do INSERT ... SELECT
6905     No special handling to save cache space is developed currently.
6906 
6907   HA_EXTRA_PREPARE_FOR_UPDATE:
6908     This is called as part of a multi-table update. When the table to be
6909     updated is also scanned then this informs MyISAM handler to drop any
6910     caches if dynamic records are used (fixed size records do not care
6911     about this call). We pass this along to the first partition to scan, and
6912     flag that it is to be called after HA_EXTRA_CACHE when moving to the next
6913     partition to scan.
6914 
6915   HA_EXTRA_PREPARE_FOR_DROP:
6916     Only used by MyISAM, called in preparation for a DROP TABLE.
6917     It's used mostly by Windows that cannot handle dropping an open file.
6918     On other platforms it has the same effect as HA_EXTRA_FORCE_REOPEN.
6919 
6920   HA_EXTRA_READCHECK:
6921   HA_EXTRA_NO_READCHECK:
6922     Only one call to HA_EXTRA_NO_READCHECK from ha_open where it says that
6923     this is not needed in SQL. The reason for this call is that MyISAM sets
6924     the READ_CHECK_USED in the open call so the call is needed for MyISAM
6925     to reset this feature.
6926     The idea with this parameter was to inform of doing/not doing a read
6927     check before applying an update. Since SQL always performs a read before
6928     applying the update No Read Check is needed in MyISAM as well.
6929 
6930     This is a cut from Docs/myisam.txt
6931      Sometimes you might want to force an update without checking whether
6932      another user has changed the record since you last read it. This is
6933      somewhat dangerous, so it should ideally not be used. That can be
6934      accomplished by wrapping the mi_update() call in two calls to mi_extra(),
6935      using these functions:
6936      HA_EXTRA_NO_READCHECK=5                 No readcheck on update
6937      HA_EXTRA_READCHECK=6                    Use readcheck (def)
6938 
6939 
6940   4) Operations only used by temporary tables for query processing
6941   ----------------------------------------------------------------
6942   HA_EXTRA_RESET_STATE:
6943     Same as reset() except that buffers are not released. If there is
6944     a READ CACHE it is reinit'ed. A cache is reinit'ed to restart reading
6945     or to change type of cache between READ CACHE and WRITE CACHE.
6946 
6947     This extra function is always called immediately before calling
6948     delete_all_rows on the handler for temporary tables.
6949     There are cases however when HA_EXTRA_RESET_STATE isn't called in
6950     a similar case for a temporary table in sql_union.cc and in two other
6951     cases HA_EXTRA_NO_CACHE is called before and HA_EXTRA_WRITE_CACHE
6952     called afterwards.
6953     The case with HA_EXTRA_NO_CACHE and HA_EXTRA_WRITE_CACHE means
6954     disable caching, delete all rows and enable WRITE CACHE. This is
6955     used for temporary tables containing distinct sums and a
6956     functional group.
6957 
6958     The only case that delete_all_rows is called on non-temporary tables
6959     is in sql_delete.cc when DELETE FROM table; is called by a user.
6960     In this case no special extra calls are performed before or after this
6961     call.
6962 
6963     The partition handler should not need to bother about this one. It
6964     should never be called.
6965 
6966   HA_EXTRA_NO_ROWS:
6967     Don't insert rows indication to HEAP and MyISAM, only used by temporary
6968     tables used in query processing.
6969     Not handled by partition handler.
6970 
6971   5) Operations only used by MyISAM internally
6972   --------------------------------------------
6973   HA_EXTRA_REINIT_CACHE:
6974     This call reinitializes the READ CACHE described above if there is one
6975     and otherwise the call is ignored.
6976 
6977     We can thus safely call it on all underlying handlers if they are
6978     MyISAM handlers. It is however never called so we don't handle it at all.
6979   HA_EXTRA_FLUSH_CACHE:
6980     Flush WRITE CACHE in MyISAM. It is only from one place in the code.
6981     This is in sql_insert.cc where it is called if the table_flags doesn't
6982     contain HA_DUPLICATE_POS. The only handler having the HA_DUPLICATE_POS
6983     set is the MyISAM handler and so the only handler not receiving this
6984     call is MyISAM.
6985     Thus in effect this call is called but never used. Could be removed
6986     from sql_insert.cc
6987   HA_EXTRA_NO_USER_CHANGE:
6988     Only used by MyISAM, never called.
6989     Simulates lock_type as locked.
6990   HA_EXTRA_WAIT_LOCK:
6991   HA_EXTRA_WAIT_NOLOCK:
6992     Only used by MyISAM, called from MyISAM handler but never from server
6993     code on top of the handler.
6994     Sets lock_wait on/off
6995   HA_EXTRA_NO_KEYS:
6996     Only used MyISAM, only used internally in MyISAM handler, never called
6997     from server level.
6998   HA_EXTRA_KEYREAD_CHANGE_POS:
6999   HA_EXTRA_REMEMBER_POS:
7000   HA_EXTRA_RESTORE_POS:
7001   HA_EXTRA_PRELOAD_BUFFER_SIZE:
7002   HA_EXTRA_CHANGE_KEY_TO_DUP:
7003   HA_EXTRA_CHANGE_KEY_TO_UNIQUE:
7004     Only used by MyISAM, never called.
7005 
7006   6) Operations not used at all
7007   -----------------------------
7008   HA_EXTRA_KEY_CACHE:
7009   HA_EXTRA_NO_KEY_CACHE:
    These parameters are no longer used and could be removed.
7011 
7012   7) Operations only used by federated tables for query processing
7013   ----------------------------------------------------------------
7014   HA_EXTRA_INSERT_WITH_UPDATE:
7015     Inform handler that an "INSERT...ON DUPLICATE KEY UPDATE" will be
7016     executed. This condition is unset by HA_EXTRA_NO_IGNORE_DUP_KEY.
7017 
7018   8) Operations only used by NDB
7019   ------------------------------
7020   HA_EXTRA_DELETE_CANNOT_BATCH:
7021   HA_EXTRA_UPDATE_CANNOT_BATCH:
7022     Inform handler that delete_row()/update_row() cannot batch deletes/updates
7023     and should perform them immediately. This may be needed when table has
7024     AFTER DELETE/UPDATE triggers which access to subject table.
7025     These flags are reset by the handler::extra(HA_EXTRA_RESET) call.
7026 
7027   9) Operations only used by MERGE
7028   ------------------------------
7029   HA_EXTRA_ADD_CHILDREN_LIST:
7030   HA_EXTRA_ATTACH_CHILDREN:
7031   HA_EXTRA_IS_ATTACHED_CHILDREN:
7032   HA_EXTRA_DETACH_CHILDREN:
7033     Special actions for MERGE tables. Ignore.
7034 
7035   10) Operations only used by InnoDB
7036   ----------------------------------
7037   HA_EXTRA_EXPORT:
7038     Prepare table for export
7039     (e.g. quiesce the table and write table metadata).
7040 
7041   11) Operations only used by partitioning
7042   ------------------------------
7043   HA_EXTRA_SECONDARY_SORT_ROWID:
7044     INDEX_MERGE type of execution, needs to do secondary sort by
7045     ROWID (handler::ref).
7046 */
7047 
extra(enum ha_extra_function operation)7048 int ha_partition::extra(enum ha_extra_function operation)
7049 {
7050   DBUG_ENTER("ha_partition:extra");
7051   DBUG_PRINT("info", ("operation: %d", (int) operation));
7052 
7053   switch (operation) {
7054     /* Category 1), used by most handlers */
7055   case HA_EXTRA_KEYREAD:
7056   case HA_EXTRA_NO_KEYREAD:
7057   case HA_EXTRA_FLUSH:
7058     DBUG_RETURN(loop_extra(operation));
7059   case HA_EXTRA_PREPARE_FOR_RENAME:
7060   case HA_EXTRA_FORCE_REOPEN:
7061     DBUG_RETURN(loop_extra_alter(operation));
7062     break;
7063 
7064     /* Category 2), used by non-MyISAM handlers */
7065   case HA_EXTRA_IGNORE_DUP_KEY:
7066   case HA_EXTRA_NO_IGNORE_DUP_KEY:
7067   case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
7068   {
7069     if (!m_myisam)
7070       DBUG_RETURN(loop_extra(operation));
7071     break;
7072   }
7073 
7074   /* Category 3), used by MyISAM handlers */
7075   case HA_EXTRA_PREPARE_FOR_UPDATE:
7076     /*
7077       Needs to be run on the first partition in the range now, and
7078       later in late_extra_cache, when switching to a new partition to scan.
7079     */
7080     m_extra_prepare_for_update= TRUE;
7081     if (m_part_spec.start_part != NO_CURRENT_PART_ID)
7082     {
7083       if (!m_extra_cache)
7084         m_extra_cache_part_id= m_part_spec.start_part;
7085       DBUG_ASSERT(m_extra_cache_part_id == m_part_spec.start_part);
7086       (void) m_file[m_part_spec.start_part]->extra(HA_EXTRA_PREPARE_FOR_UPDATE);
7087     }
7088     break;
7089   case HA_EXTRA_NORMAL:
7090   case HA_EXTRA_QUICK:
7091   case HA_EXTRA_PREPARE_FOR_DROP:
7092   case HA_EXTRA_FLUSH_CACHE:
7093   {
7094     if (m_myisam)
7095       DBUG_RETURN(loop_extra(operation));
7096     break;
7097   }
7098   case HA_EXTRA_NO_READCHECK:
7099   {
7100     /*
7101       This is only done as a part of ha_open, which is also used in
7102       ha_partition::open, so no need to do anything.
7103     */
7104     break;
7105   }
7106   case HA_EXTRA_CACHE:
7107   {
7108     prepare_extra_cache(0);
7109     break;
7110   }
7111   case HA_EXTRA_NO_CACHE:
7112   {
7113     int ret= 0;
7114     if (m_extra_cache_part_id != NO_CURRENT_PART_ID)
7115       ret= m_file[m_extra_cache_part_id]->extra(HA_EXTRA_NO_CACHE);
7116     m_extra_cache= FALSE;
7117     m_extra_cache_size= 0;
7118     m_extra_prepare_for_update= FALSE;
7119     m_extra_cache_part_id= NO_CURRENT_PART_ID;
7120     DBUG_RETURN(ret);
7121   }
7122   case HA_EXTRA_WRITE_CACHE:
7123   {
7124     m_extra_cache= FALSE;
7125     m_extra_cache_size= 0;
7126     m_extra_prepare_for_update= FALSE;
7127     m_extra_cache_part_id= NO_CURRENT_PART_ID;
7128     DBUG_RETURN(loop_extra(operation));
7129   }
7130   case HA_EXTRA_IGNORE_NO_KEY:
7131   case HA_EXTRA_NO_IGNORE_NO_KEY:
7132   {
7133     /*
7134       Ignore as these are specific to NDB for handling
7135       idempotency
7136      */
7137     break;
7138   }
7139   case HA_EXTRA_WRITE_CAN_REPLACE:
7140   case HA_EXTRA_WRITE_CANNOT_REPLACE:
7141   {
7142     /*
7143       Informs handler that write_row() can replace rows which conflict
7144       with row being inserted by PK/unique key without reporting error
7145       to the SQL-layer.
7146 
7147       This optimization is not safe for partitioned table in general case
7148       since we may have to put new version of row into partition which is
7149       different from partition in which old version resides (for example
7150       when we partition by non-PK column or by some column which is not
7151       part of unique key which were violated).
7152       And since NDB which is the only engine at the moment that supports
7153       this optimization handles partitioning on its own we simple disable
7154       it here. (BTW for NDB this optimization is safe since it supports
7155       only KEY partitioning and won't use this optimization for tables
7156       which have additional unique constraints).
7157     */
7158     break;
7159   }
7160     /* Category 7), used by federated handlers */
7161   case HA_EXTRA_INSERT_WITH_UPDATE:
7162     DBUG_RETURN(loop_extra(operation));
7163     /* Category 8) Operations only used by NDB */
7164   case HA_EXTRA_DELETE_CANNOT_BATCH:
7165   case HA_EXTRA_UPDATE_CANNOT_BATCH:
7166   {
7167     /* Currently only NDB use the *_CANNOT_BATCH */
7168     break;
7169   }
7170     /* Category 9) Operations only used by MERGE */
7171   case HA_EXTRA_ADD_CHILDREN_LIST:
7172   case HA_EXTRA_ATTACH_CHILDREN:
7173   case HA_EXTRA_IS_ATTACHED_CHILDREN:
7174   case HA_EXTRA_DETACH_CHILDREN:
7175   {
7176     /* Special actions for MERGE tables. Ignore. */
7177     break;
7178   }
7179   /*
7180     http://dev.mysql.com/doc/refman/5.1/en/partitioning-limitations.html
7181     says we no longer support logging to partitioned tables, so we fail
7182     here.
7183   */
7184   case HA_EXTRA_MARK_AS_LOG_TABLE:
7185     DBUG_RETURN(ER_UNSUPORTED_LOG_ENGINE);
7186     /* Category 10), used by InnoDB handlers */
7187   case HA_EXTRA_EXPORT:
7188     DBUG_RETURN(loop_extra(operation));
7189     /* Category 11) Operations only used by partitioning. */
7190   case HA_EXTRA_SECONDARY_SORT_ROWID:
7191   {
7192     /* index_init(sorted=true) must have been called! */
7193     DBUG_ASSERT(m_ordered);
7194     DBUG_ASSERT(m_ordered_rec_buffer);
7195     /* No index_read call must have been done! */
7196     DBUG_ASSERT(m_queue.elements == 0);
7197     /* If not PK is set as secondary sort, do secondary sort by rowid/ref. */
7198     if (!m_curr_key_info[1])
7199     {
7200       m_sec_sort_by_rowid= true;
7201       queue_set_compare(&m_queue, key_and_ref_cmp);
7202     }
7203     break;
7204   }
7205   default:
7206   {
7207     /* Temporary crash to discover what is wrong */
7208     DBUG_ASSERT(0);
7209     break;
7210   }
7211   }
7212   DBUG_RETURN(0);
7213 }
7214 
7215 
7216 /**
7217   Special extra call to reset extra parameters
7218 
7219   @return Operation status.
7220     @retval >0 Error code
7221     @retval 0  Success
7222 
7223   @note Called at end of each statement to reset buffers.
7224   To avoid excessive calls, the m_partitions_to_reset bitmap keep records
7225   of which partitions that have been used in extra(), external_lock() or
7226   start_stmt() and is needed to be called.
7227 */
7228 
reset(void)7229 int ha_partition::reset(void)
7230 {
7231   int result= 0;
7232   int tmp;
7233   uint i;
7234   DBUG_ENTER("ha_partition::reset");
7235 
7236   for (i= bitmap_get_first_set(&m_partitions_to_reset);
7237        i < m_tot_parts;
7238        i= bitmap_get_next_set(&m_partitions_to_reset, i))
7239   {
7240     if ((tmp= m_file[i]->ha_reset()))
7241       result= tmp;
7242   }
7243   bitmap_clear_all(&m_partitions_to_reset);
7244   DBUG_RETURN(result);
7245 }
7246 
7247 /*
7248   Special extra method for HA_EXTRA_CACHE with cachesize as extra parameter
7249 
7250   SYNOPSIS
7251     extra_opt()
7252     operation                      Must be HA_EXTRA_CACHE
7253     cachesize                      Size of cache in full table scan
7254 
7255   RETURN VALUE
7256     >0                   Error code
7257     0                    Success
7258 */
7259 
extra_opt(enum ha_extra_function operation,ulong cachesize)7260 int ha_partition::extra_opt(enum ha_extra_function operation, ulong cachesize)
7261 {
7262   DBUG_ENTER("ha_partition::extra_opt()");
7263 
7264   DBUG_ASSERT(HA_EXTRA_CACHE == operation);
7265   prepare_extra_cache(cachesize);
7266   DBUG_RETURN(0);
7267 }
7268 
7269 
7270 /*
7271   Call extra on handler with HA_EXTRA_CACHE and cachesize
7272 
7273   SYNOPSIS
7274     prepare_extra_cache()
7275     cachesize                Size of cache for full table scan
7276 
7277   RETURN VALUE
7278     NONE
7279 */
7280 
prepare_extra_cache(uint cachesize)7281 void ha_partition::prepare_extra_cache(uint cachesize)
7282 {
7283   DBUG_ENTER("ha_partition::prepare_extra_cache()");
7284   DBUG_PRINT("info", ("cachesize %u", cachesize));
7285 
7286   m_extra_cache= TRUE;
7287   m_extra_cache_size= cachesize;
7288   if (m_part_spec.start_part != NO_CURRENT_PART_ID)
7289   {
7290     DBUG_ASSERT(bitmap_is_set(&m_partitions_to_reset,
7291                               m_part_spec.start_part));
7292     bitmap_set_bit(&m_partitions_to_reset, m_part_spec.start_part);
7293     late_extra_cache(m_part_spec.start_part);
7294   }
7295   DBUG_VOID_RETURN;
7296 }
7297 
7298 
7299 /**
7300   Prepares our new and reorged handlers for rename or delete.
7301 
7302   @param operation Operation to forward
7303 
7304   @return Operation status
7305     @retval 0  Success
7306     @retval !0 Error
7307 */
7308 
loop_extra_alter(enum ha_extra_function operation)7309 int ha_partition::loop_extra_alter(enum ha_extra_function operation)
7310 {
7311   int result= 0, tmp;
7312   handler **file;
7313   DBUG_ENTER("ha_partition::loop_extra_alter()");
7314   DBUG_ASSERT(operation == HA_EXTRA_PREPARE_FOR_RENAME ||
7315               operation == HA_EXTRA_FORCE_REOPEN);
7316 
7317   if (m_new_file != NULL)
7318   {
7319     for (file= m_new_file; *file; file++)
7320       if ((tmp= (*file)->extra(operation)))
7321         result= tmp;
7322   }
7323   if (m_reorged_file != NULL)
7324   {
7325     for (file= m_reorged_file; *file; file++)
7326       if ((tmp= (*file)->extra(operation)))
7327         result= tmp;
7328   }
7329   if ((tmp= loop_extra(operation)))
7330     result= tmp;
7331   DBUG_RETURN(result);
7332 }
7333 
7334 /*
7335   Call extra on all partitions
7336 
7337   SYNOPSIS
7338     loop_extra()
7339     operation             extra operation type
7340 
7341   RETURN VALUE
7342     >0                    Error code
7343     0                     Success
7344 */
7345 
loop_extra(enum ha_extra_function operation)7346 int ha_partition::loop_extra(enum ha_extra_function operation)
7347 {
7348   int result= 0, tmp;
7349   uint i;
7350   DBUG_ENTER("ha_partition::loop_extra()");
7351 
7352   for (i= bitmap_get_first_set(&m_part_info->lock_partitions);
7353        i < m_tot_parts;
7354        i= bitmap_get_next_set(&m_part_info->lock_partitions, i))
7355   {
7356     if ((tmp= m_file[i]->extra(operation)))
7357       result= tmp;
7358   }
7359   /* Add all used partitions to be called in reset(). */
7360   bitmap_union(&m_partitions_to_reset, &m_part_info->lock_partitions);
7361   DBUG_RETURN(result);
7362 }
7363 
7364 
7365 /*
7366   Call extra(HA_EXTRA_CACHE) on next partition_id
7367 
7368   SYNOPSIS
7369     late_extra_cache()
7370     partition_id               Partition id to call extra on
7371 
7372   RETURN VALUE
7373     NONE
7374 */
7375 
late_extra_cache(uint partition_id)7376 void ha_partition::late_extra_cache(uint partition_id)
7377 {
7378   handler *file;
7379   DBUG_ENTER("ha_partition::late_extra_cache");
7380   DBUG_PRINT("info", ("extra_cache %u prepare %u partid %u size %u",
7381                       m_extra_cache, m_extra_prepare_for_update,
7382                       partition_id, m_extra_cache_size));
7383 
7384   if (!m_extra_cache && !m_extra_prepare_for_update)
7385     DBUG_VOID_RETURN;
7386   file= m_file[partition_id];
7387   if (m_extra_cache)
7388   {
7389     if (m_extra_cache_size == 0)
7390       (void) file->extra(HA_EXTRA_CACHE);
7391     else
7392       (void) file->extra_opt(HA_EXTRA_CACHE, m_extra_cache_size);
7393   }
7394   if (m_extra_prepare_for_update)
7395   {
7396     (void) file->extra(HA_EXTRA_PREPARE_FOR_UPDATE);
7397   }
7398   m_extra_cache_part_id= partition_id;
7399   DBUG_VOID_RETURN;
7400 }
7401 
7402 
7403 /*
7404   Call extra(HA_EXTRA_NO_CACHE) on next partition_id
7405 
7406   SYNOPSIS
7407     late_extra_no_cache()
7408     partition_id               Partition id to call extra on
7409 
7410   RETURN VALUE
7411     NONE
7412 */
7413 
late_extra_no_cache(uint partition_id)7414 void ha_partition::late_extra_no_cache(uint partition_id)
7415 {
7416   handler *file;
7417   DBUG_ENTER("ha_partition::late_extra_no_cache");
7418 
7419   if (!m_extra_cache && !m_extra_prepare_for_update)
7420     DBUG_VOID_RETURN;
7421   file= m_file[partition_id];
7422   (void) file->extra(HA_EXTRA_NO_CACHE);
7423   DBUG_ASSERT(partition_id == m_extra_cache_part_id);
7424   m_extra_cache_part_id= NO_CURRENT_PART_ID;
7425   DBUG_VOID_RETURN;
7426 }
7427 
7428 
7429 /****************************************************************************
7430                 MODULE optimiser support
7431 ****************************************************************************/
7432 
7433 /**
7434   Get keys to use for scanning.
7435 
7436   @return key_map of keys usable for scanning
7437 
7438   @note No need to use read_partitions here, since it does not depend on
7439   which partitions is used, only which storage engine used.
7440 */
7441 
keys_to_use_for_scanning()7442 const key_map *ha_partition::keys_to_use_for_scanning()
7443 {
7444   DBUG_ENTER("ha_partition::keys_to_use_for_scanning");
7445   DBUG_RETURN(m_file[0]->keys_to_use_for_scanning());
7446 }
7447 
7448 
7449 /**
7450   Minimum number of rows to base optimizer estimate on.
7451 */
7452 
min_rows_for_estimate()7453 ha_rows ha_partition::min_rows_for_estimate()
7454 {
7455   uint i, max_used_partitions, tot_used_partitions;
7456   DBUG_ENTER("ha_partition::min_rows_for_estimate");
7457 
7458   tot_used_partitions= bitmap_bits_set(&m_part_info->read_partitions);
7459 
7460   /*
7461     All partitions might have been left as unused during partition pruning
7462     due to, for example, an impossible WHERE condition. Nonetheless, the
7463     optimizer might still attempt to perform (e.g. range) analysis where an
7464     estimate of the the number of rows is calculated using records_in_range.
7465     Hence, to handle this and other possible cases, use zero as the minimum
7466     number of rows to base the estimate on if no partition is being used.
7467   */
7468   if (!tot_used_partitions)
7469     DBUG_RETURN(0);
7470 
7471   /*
7472     Allow O(log2(tot_partitions)) increase in number of used partitions.
7473     This gives O(tot_rows/log2(tot_partitions)) rows to base the estimate on.
7474     I.e when the total number of partitions doubles, allow one more
7475     partition to be checked.
7476   */
7477   i= 2;
7478   max_used_partitions= 1;
7479   while (i < m_tot_parts)
7480   {
7481     max_used_partitions++;
7482     i= i << 1;
7483   }
7484   if (max_used_partitions > tot_used_partitions)
7485     max_used_partitions= tot_used_partitions;
7486 
7487   /* stats.records is already updated by the info(HA_STATUS_VARIABLE) call. */
7488   DBUG_PRINT("info", ("max_used_partitions: %u tot_rows: %lu",
7489                       max_used_partitions,
7490                       (ulong) stats.records));
7491   DBUG_PRINT("info", ("tot_used_partitions: %u min_rows_to_check: %lu",
7492                       tot_used_partitions,
7493                       (ulong) stats.records * max_used_partitions
7494                               / tot_used_partitions));
7495   DBUG_RETURN(stats.records * max_used_partitions / tot_used_partitions);
7496 }
7497 
7498 
7499 /**
7500   Get the biggest used partition.
7501 
7502   Starting at the N:th biggest partition and skips all non used
7503   partitions, returning the biggest used partition found
7504 
7505   @param[in,out] part_index  Skip the *part_index biggest partitions
7506 
7507   @return The biggest used partition with index not lower than *part_index.
7508     @retval NO_CURRENT_PART_ID     No more partition used.
7509     @retval != NO_CURRENT_PART_ID  partition id of biggest used partition with
7510                                    index >= *part_index supplied. Note that
7511                                    *part_index will be updated to the next
7512                                    partition index to use.
7513 */
7514 
get_biggest_used_partition(uint * part_index)7515 uint ha_partition::get_biggest_used_partition(uint *part_index)
7516 {
7517   uint part_id;
7518   while ((*part_index) < m_tot_parts)
7519   {
7520     part_id= m_part_ids_sorted_by_num_of_records[(*part_index)++];
7521     if (bitmap_is_set(&m_part_info->read_partitions, part_id))
7522       return part_id;
7523   }
7524   return NO_CURRENT_PART_ID;
7525 }
7526 
7527 
7528 /*
7529   Return time for a scan of the table
7530 
7531   SYNOPSIS
7532     scan_time()
7533 
7534   RETURN VALUE
7535     time for scan
7536 */
7537 
scan_time()7538 double ha_partition::scan_time()
7539 {
7540   double scan_time= 0;
7541   uint i;
7542   DBUG_ENTER("ha_partition::scan_time");
7543 
7544   for (i= bitmap_get_first_set(&m_part_info->read_partitions);
7545        i < m_tot_parts;
7546        i= bitmap_get_next_set(&m_part_info->read_partitions, i))
7547     scan_time+= m_file[i]->scan_time();
7548   DBUG_RETURN(scan_time);
7549 }
7550 
7551 
7552 /**
7553   Find number of records in a range.
7554   @param inx      Index number
7555   @param min_key  Start of range
7556   @param max_key  End of range
7557 
7558   @return Number of rows in range.
7559 
7560   Given a starting key, and an ending key estimate the number of rows that
7561   will exist between the two. max_key may be empty which in case determine
7562   if start_key matches any rows.
7563 */
7564 
records_in_range(uint inx,key_range * min_key,key_range * max_key)7565 ha_rows ha_partition::records_in_range(uint inx, key_range *min_key,
7566 				       key_range *max_key)
7567 {
7568   ha_rows min_rows_to_check, rows, estimated_rows=0, checked_rows= 0;
7569   uint partition_index= 0, part_id;
7570   DBUG_ENTER("ha_partition::records_in_range");
7571 
7572   min_rows_to_check= min_rows_for_estimate();
7573 
7574   while ((part_id= get_biggest_used_partition(&partition_index))
7575          != NO_CURRENT_PART_ID)
7576   {
7577     rows= m_file[part_id]->records_in_range(inx, min_key, max_key);
7578 
7579     DBUG_PRINT("info", ("part %u match %lu rows of %lu", part_id, (ulong) rows,
7580                         (ulong) m_file[part_id]->stats.records));
7581 
7582     if (rows == HA_POS_ERROR)
7583       DBUG_RETURN(HA_POS_ERROR);
7584     estimated_rows+= rows;
7585     checked_rows+= m_file[part_id]->stats.records;
7586     /*
7587       Returning 0 means no rows can be found, so we must continue
7588       this loop as long as we have estimated_rows == 0.
7589       Also many engines return 1 to indicate that there may exist
7590       a matching row, we do not normalize this by dividing by number of
7591       used partitions, but leave it to be returned as a sum, which will
7592       reflect that we will need to scan each partition's index.
7593 
7594       Note that this statistics may not always be correct, so we must
7595       continue even if the current partition has 0 rows, since we might have
7596       deleted rows from the current partition, or inserted to the next
7597       partition.
7598     */
7599     if (estimated_rows && checked_rows &&
7600         checked_rows >= min_rows_to_check)
7601     {
7602       DBUG_PRINT("info",
7603                  ("records_in_range(inx %u): %lu (%lu * %lu / %lu)",
7604                   inx,
7605                   (ulong) (estimated_rows * stats.records / checked_rows),
7606                   (ulong) estimated_rows,
7607                   (ulong) stats.records,
7608                   (ulong) checked_rows));
7609       DBUG_RETURN(estimated_rows * stats.records / checked_rows);
7610     }
7611   }
7612   DBUG_PRINT("info", ("records_in_range(inx %u): %lu",
7613                       inx,
7614                       (ulong) estimated_rows));
7615   DBUG_RETURN(estimated_rows);
7616 }
7617 
7618 
7619 /**
7620   Estimate upper bound of number of rows.
7621 
7622   @return Number of rows.
7623 */
7624 
estimate_rows_upper_bound()7625 ha_rows ha_partition::estimate_rows_upper_bound()
7626 {
7627   ha_rows rows, tot_rows= 0;
7628   handler **file= m_file;
7629   DBUG_ENTER("ha_partition::estimate_rows_upper_bound");
7630 
7631   do
7632   {
7633     if (bitmap_is_set(&(m_part_info->read_partitions), (file - m_file)))
7634     {
7635       rows= (*file)->estimate_rows_upper_bound();
7636       if (rows == HA_POS_ERROR)
7637         DBUG_RETURN(HA_POS_ERROR);
7638       tot_rows+= rows;
7639     }
7640   } while (*(++file));
7641   DBUG_RETURN(tot_rows);
7642 }
7643 
7644 
7645 /*
7646   Get time to read
7647 
7648   SYNOPSIS
7649     read_time()
7650     index                Index number used
7651     ranges               Number of ranges
7652     rows                 Number of rows
7653 
7654   RETURN VALUE
7655     time for read
7656 
7657   DESCRIPTION
7658     This will be optimised later to include whether or not the index can
7659     be used with partitioning. To achieve we need to add another parameter
7660     that specifies how many of the index fields that are bound in the ranges.
7661     Possibly added as a new call to handlers.
7662 */
7663 
read_time(uint index,uint ranges,ha_rows rows)7664 double ha_partition::read_time(uint index, uint ranges, ha_rows rows)
7665 {
7666   DBUG_ENTER("ha_partition::read_time");
7667 
7668   DBUG_RETURN(m_file[0]->read_time(index, ranges, rows));
7669 }
7670 
7671 
7672 /**
7673   Number of rows in table. see handler.h
7674 
7675   @return Number of records in the table (after pruning!)
7676 */
7677 
records()7678 ha_rows ha_partition::records()
7679 {
7680   ha_rows rows, tot_rows= 0;
7681   uint i;
7682   DBUG_ENTER("ha_partition::records");
7683 
7684   for (i= bitmap_get_first_set(&m_part_info->read_partitions);
7685        i < m_tot_parts;
7686        i= bitmap_get_next_set(&m_part_info->read_partitions, i))
7687   {
7688     rows= m_file[i]->records();
7689     if (rows == HA_POS_ERROR)
7690       DBUG_RETURN(HA_POS_ERROR);
7691     tot_rows+= rows;
7692   }
7693   DBUG_RETURN(tot_rows);
7694 }
7695 
7696 
7697 /*
7698   Is it ok to switch to a new engine for this table
7699 
7700   SYNOPSIS
7701     can_switch_engine()
7702 
7703   RETURN VALUE
7704     TRUE                  Ok
7705     FALSE                 Not ok
7706 
7707   DESCRIPTION
7708     Used to ensure that tables with foreign key constraints are not moved
7709     to engines without foreign key support.
7710 */
7711 
can_switch_engines()7712 bool ha_partition::can_switch_engines()
7713 {
7714   handler **file;
7715   DBUG_ENTER("ha_partition::can_switch_engines");
7716 
7717   file= m_file;
7718   do
7719   {
7720     if (!(*file)->can_switch_engines())
7721       DBUG_RETURN(FALSE);
7722   } while (*(++file));
7723   DBUG_RETURN(TRUE);
7724 }
7725 
7726 
7727 /*
7728   Is table cache supported
7729 
7730   SYNOPSIS
7731     table_cache_type()
7732 
7733 */
7734 
table_cache_type()7735 uint8 ha_partition::table_cache_type()
7736 {
7737   DBUG_ENTER("ha_partition::table_cache_type");
7738 
7739   DBUG_RETURN(m_file[0]->table_cache_type());
7740 }
7741 
7742 
7743 /**
7744   Calculate hash value for KEY partitioning using an array of fields.
7745 
7746   @param field_array   An array of the fields in KEY partitioning
7747 
7748   @return hash_value calculated
7749 
7750   @note Uses the hash function on the character set of the field.
7751   Integer and floating point fields use the binary character set by default.
7752 */
7753 
uint32 ha_partition::calculate_key_hash_value(Field **field_array)
{
  /* nr1/nr2 are the running state of the charset hash_sort functions. */
  ulong nr1= 1;
  ulong nr2= 4;
  bool use_51_hash;
  /*
    Tables created with the 5.1 KEY algorithm must keep producing the
    5.1-compatible hash, otherwise rows would map to different partitions
    after upgrade.
  */
  use_51_hash= MY_TEST((*field_array)->table->part_info->key_algorithm ==
                       partition_info::KEY_ALGORITHM_51);

  do
  {
    Field *field= *field_array;
    if (use_51_hash)
    {
      switch (field->real_type()) {
      case MYSQL_TYPE_TINY:
      case MYSQL_TYPE_SHORT:
      case MYSQL_TYPE_LONG:
      case MYSQL_TYPE_FLOAT:
      case MYSQL_TYPE_DOUBLE:
      case MYSQL_TYPE_NEWDECIMAL:
      case MYSQL_TYPE_TIMESTAMP:
      case MYSQL_TYPE_LONGLONG:
      case MYSQL_TYPE_INT24:
      case MYSQL_TYPE_TIME:
      case MYSQL_TYPE_DATETIME:
      case MYSQL_TYPE_YEAR:
      case MYSQL_TYPE_NEWDATE:
        {
          if (field->is_null())
          {
            /* Fold NULL into the state; `continue` targets the do-while. */
            nr1^= (nr1 << 1) | 1;
            continue;
          }
          /* Force this to my_hash_sort_bin, which was used in 5.1! */
          uint len= field->pack_length();
          my_charset_bin.coll->hash_sort(&my_charset_bin, field->ptr, len,
                                         &nr1, &nr2);
          /* Done with this field, continue with next one. */
          continue;
        }
      case MYSQL_TYPE_STRING:
      case MYSQL_TYPE_VARCHAR:
      case MYSQL_TYPE_BIT:
        /* Not affected, same in 5.1 and 5.5 */
        break;
      /*
        ENUM/SET uses my_hash_sort_simple in 5.1 (i.e. my_charset_latin1)
        and my_hash_sort_bin in 5.5!
      */
      case MYSQL_TYPE_ENUM:
      case MYSQL_TYPE_SET:
        {
          if (field->is_null())
          {
            /* Same NULL folding as above. */
            nr1^= (nr1 << 1) | 1;
            continue;
          }
          /* Force this to my_hash_sort_bin, which was used in 5.1! */
          uint len= field->pack_length();
          my_charset_latin1.coll->hash_sort(&my_charset_latin1, field->ptr,
                                            len, &nr1, &nr2);
          continue;
        }
      /* New types in mysql-5.6. */
      case MYSQL_TYPE_DATETIME2:
      case MYSQL_TYPE_TIME2:
      case MYSQL_TYPE_TIMESTAMP2:
        /* Not affected, 5.6+ only! */
        break;

      /* These types should not be allowed for partitioning! */
      case MYSQL_TYPE_NULL:
      case MYSQL_TYPE_DECIMAL:
      case MYSQL_TYPE_DATE:
      case MYSQL_TYPE_TINY_BLOB:
      case MYSQL_TYPE_MEDIUM_BLOB:
      case MYSQL_TYPE_LONG_BLOB:
      case MYSQL_TYPE_BLOB:
      case MYSQL_TYPE_VAR_STRING:
      case MYSQL_TYPE_GEOMETRY:
        /* fall through. */
      default:
        DBUG_ASSERT(0);                    // New type?
        /* Fall through for default hashing (5.5). */
      }
      /* fall through, use collation based hashing. */
    }
    /* 5.5+ behavior: hash with the field's own collation. */
    field->hash(&nr1, &nr2);
  } while (*(++field_array));
  return (uint32) nr1;
}
7845 
7846 
7847 /****************************************************************************
7848                 MODULE print messages
7849 ****************************************************************************/
7850 
index_type(uint inx)7851 const char *ha_partition::index_type(uint inx)
7852 {
7853   uint first_used_partition;
7854   DBUG_ENTER("ha_partition::index_type");
7855 
7856   first_used_partition= bitmap_get_first_set(&(m_part_info->read_partitions));
7857 
7858   if (first_used_partition == MY_BIT_NONE)
7859   {
7860     DBUG_ASSERT(0);                             // How can this happen?
7861     DBUG_RETURN(handler::index_type(inx));
7862   }
7863 
7864   DBUG_RETURN(m_file[first_used_partition]->index_type(inx));
7865 }
7866 
7867 
get_row_type() const7868 enum row_type ha_partition::get_row_type() const
7869 {
7870   uint i;
7871   enum row_type type;
7872   DBUG_ENTER("ha_partition::get_row_type");
7873 
7874   i= bitmap_get_first_set(&m_part_info->read_partitions);
7875   DBUG_ASSERT(i < m_tot_parts);
7876   if (i >= m_tot_parts)
7877     DBUG_RETURN(ROW_TYPE_NOT_USED);
7878 
7879   type= m_file[i]->get_row_type();
7880   DBUG_PRINT("info", ("partition %u, row_type: %d", i, type));
7881 
7882   for (i= bitmap_get_next_set(&m_part_info->lock_partitions, i);
7883        i < m_tot_parts;
7884        i= bitmap_get_next_set(&m_part_info->lock_partitions, i))
7885   {
7886     enum row_type part_type= m_file[i]->get_row_type();
7887     DBUG_PRINT("info", ("partition %u, row_type: %d", i, type));
7888     if (part_type != type)
7889       DBUG_RETURN(ROW_TYPE_NOT_USED);
7890   }
7891 
7892   DBUG_RETURN(type);
7893 }
7894 
7895 
append_row_to_str(String & str)7896 void ha_partition::append_row_to_str(String &str)
7897 {
7898   const uchar *rec;
7899   bool is_rec0= !m_err_rec || m_err_rec == table->record[0];
7900   if (is_rec0)
7901     rec= table->record[0];
7902   else
7903     rec= m_err_rec;
7904   // If PK, use full PK instead of full part field array!
7905   if (table->s->primary_key != MAX_KEY)
7906   {
7907     KEY *key= table->key_info + table->s->primary_key;
7908     KEY_PART_INFO *key_part=     key->key_part;
7909     KEY_PART_INFO *key_part_end= key_part + key->user_defined_key_parts;
7910     if (!is_rec0)
7911       set_key_field_ptr(key, rec, table->record[0]);
7912     for (; key_part != key_part_end; key_part++)
7913     {
7914       Field *field= key_part->field;
7915       str.append(" ");
7916       str.append(field->field_name);
7917       str.append(":");
7918       field_unpack(&str, field, rec, 0, false);
7919     }
7920     if (!is_rec0)
7921       set_key_field_ptr(key, table->record[0], rec);
7922   }
7923   else
7924   {
7925     Field **field_ptr;
7926     if (!is_rec0)
7927       set_field_ptr(m_part_info->full_part_field_array, rec,
7928                     table->record[0]);
7929     /* No primary key, use full partition field array. */
7930     for (field_ptr= m_part_info->full_part_field_array;
7931          *field_ptr;
7932          field_ptr++)
7933     {
7934       Field *field= *field_ptr;
7935       str.append(" ");
7936       str.append(field->field_name);
7937       str.append(":");
7938       field_unpack(&str, field, rec, 0, false);
7939     }
7940     if (!is_rec0)
7941       set_field_ptr(m_part_info->full_part_field_array, table->record[0],
7942                     rec);
7943   }
7944 }
7945 
7946 
/**
  Report a handler error to the client.

  Handles two partition-specific errors itself:
  - HA_ERR_NO_PARTITION_FOUND: print which values matched no partition
    (suppressed for ALTER TABLE ... TRUNCATE PARTITION).
  - HA_ERR_ROW_IN_WRONG_PARTITION: log the misplaced row (from m_err_rec)
    and tell the user to REPAIR the table.
  All other errors are delegated to the last used partition's handler,
  or to the base handler if m_file is not initialized.

  @param error    Handler error code.
  @param errflag  Flags passed on to my_error().
*/
void ha_partition::print_error(int error, myf errflag)
{
  THD *thd= ha_thd();
  DBUG_ENTER("ha_partition::print_error");

  /* Should probably look for my own errors first */
  DBUG_PRINT("enter", ("error: %d", error));

  if ((error == HA_ERR_NO_PARTITION_FOUND) &&
      ! (thd->lex->alter_info.flags & Alter_info::ALTER_TRUNCATE_PARTITION))
    m_part_info->print_no_partition_found(table);
  else if (error == HA_ERR_ROW_IN_WRONG_PARTITION)
  {
    /* Should only happen on DELETE or UPDATE! */
    DBUG_ASSERT(thd_sql_command(thd) == SQLCOM_DELETE ||
                thd_sql_command(thd) == SQLCOM_DELETE_MULTI ||
                thd_sql_command(thd) == SQLCOM_UPDATE ||
                thd_sql_command(thd) == SQLCOM_UPDATE_MULTI);
    DBUG_ASSERT(m_err_rec);
    if (m_err_rec)
    {
      uint max_length;
      char buf[MAX_KEY_LENGTH];
      String str(buf,sizeof(buf),system_charset_info);
      uint32 part_id;
      /* Build "(<found part> != <expected part>) <row values>". */
      str.length(0);
      str.append("(");
      str.append_ulonglong(m_last_part);
      str.append(" != ");
      /* "?" if the expected partition could not be calculated. */
      if (get_part_for_delete(m_err_rec, m_rec0, m_part_info, &part_id))
        str.append("?");
      else
        str.append_ulonglong(part_id);
      str.append(")");
      append_row_to_str(str);

      /* Log this error, so the DBA can notice it and fix it! */
      sql_print_error("Table '%-192s' corrupted: row in wrong partition: %s\n"
                      "Please REPAIR the table!",
                      table->s->table_name.str,
                      str.c_ptr_safe());

      /* Truncate the row dump so the message fits in MYSQL_ERRMSG_SIZE. */
      max_length= (MYSQL_ERRMSG_SIZE - (uint) strlen(ER(ER_ROW_IN_WRONG_PARTITION)));
      if (str.length() >= max_length)
      {
        str.length(max_length-4);
        str.append(STRING_WITH_LEN("..."));
      }
      my_error(ER_ROW_IN_WRONG_PARTITION, MYF(0), str.c_ptr_safe());
      m_err_rec= NULL;
      DBUG_VOID_RETURN;
    }
    /* fall through to generic error handling. */
  }

  /* In case m_file has not been initialized, like in bug#42438 */
  if (m_file)
  {
    if (m_last_part >= m_tot_parts)
    {
      /* Out-of-range partition index; clamp in release builds. */
      DBUG_ASSERT(0);
      m_last_part= 0;
    }
    m_file[m_last_part]->print_error(error, errflag);
  }
  else
    handler::print_error(error, errflag);
  DBUG_VOID_RETURN;
}
8016 
8017 
get_error_message(int error,String * buf)8018 bool ha_partition::get_error_message(int error, String *buf)
8019 {
8020   DBUG_ENTER("ha_partition::get_error_message");
8021 
8022   /* Should probably look for my own errors first */
8023 
8024   /* In case m_file has not been initialized, like in bug#42438 */
8025   if (m_file)
8026     DBUG_RETURN(m_file[m_last_part]->get_error_message(error, buf));
8027   DBUG_RETURN(handler::get_error_message(error, buf));
8028 
8029 }
8030 
8031 
8032 /****************************************************************************
8033                 MODULE in-place ALTER
8034 ****************************************************************************/
8035 /**
8036   Get table flags.
8037 */
8038 
table_flags() const8039 handler::Table_flags ha_partition::table_flags() const
8040 {
8041   uint first_used_partition= 0;
8042   DBUG_ENTER("ha_partition::table_flags");
8043   if (m_handler_status < handler_initialized ||
8044       m_handler_status >= handler_closed)
8045     DBUG_RETURN(PARTITION_ENABLED_TABLE_FLAGS);
8046 
8047   if (get_lock_type() != F_UNLCK)
8048   {
8049     /*
8050       The flags are cached after external_lock, and may depend on isolation
8051       level. So we should use a locked partition to get the correct flags.
8052     */
8053     first_used_partition= bitmap_get_first_set(&m_part_info->lock_partitions);
8054     if (first_used_partition == MY_BIT_NONE)
8055       first_used_partition= 0;
8056   }
8057   DBUG_RETURN((m_file[first_used_partition]->ha_table_flags() &
8058                  ~(PARTITION_DISABLED_TABLE_FLAGS)) |
8059                  (PARTITION_ENABLED_TABLE_FLAGS));
8060 }
8061 
8062 
8063 /**
8064   alter_table_flags must be on handler/table level, not on hton level
8065   due to the ha_partition hton does not know what the underlying hton is.
8066 */
alter_table_flags(uint flags)8067 uint ha_partition::alter_table_flags(uint flags)
8068 {
8069   uint flags_to_return;
8070   DBUG_ENTER("ha_partition::alter_table_flags");
8071 
8072   flags_to_return= ht->alter_table_flags(flags);
8073   flags_to_return|= m_file[0]->alter_table_flags(flags);
8074 
8075   DBUG_RETURN(flags_to_return);
8076 }
8077 
8078 
8079 /**
8080   check if copy of data is needed in alter table.
8081 */
check_if_incompatible_data(HA_CREATE_INFO * create_info,uint table_changes)8082 bool ha_partition::check_if_incompatible_data(HA_CREATE_INFO *create_info,
8083                                               uint table_changes)
8084 {
8085   handler **file;
8086   bool ret= COMPATIBLE_DATA_YES;
8087 
8088   /*
8089     The check for any partitioning related changes have already been done
8090     in mysql_alter_table (by fix_partition_func), so it is only up to
8091     the underlying handlers.
8092   */
8093   for (file= m_file; *file; file++)
8094     if ((ret=  (*file)->check_if_incompatible_data(create_info,
8095                                                    table_changes)) !=
8096         COMPATIBLE_DATA_YES)
8097       break;
8098   return ret;
8099 }
8100 
8101 
8102 /**
8103   Support of in-place alter table.
8104 */
8105 
8106 /**
8107   Helper class for in-place alter, see handler.h
8108 */
8109 
8110 class ha_partition_inplace_ctx : public inplace_alter_handler_ctx
8111 {
8112 public:
8113   inplace_alter_handler_ctx **handler_ctx_array;
8114 private:
8115   uint m_tot_parts;
8116 
8117 public:
ha_partition_inplace_ctx(THD * thd,uint tot_parts)8118   ha_partition_inplace_ctx(THD *thd, uint tot_parts)
8119     : inplace_alter_handler_ctx(),
8120       handler_ctx_array(NULL),
8121       m_tot_parts(tot_parts)
8122   {}
8123 
~ha_partition_inplace_ctx()8124   ~ha_partition_inplace_ctx()
8125   {
8126     if (handler_ctx_array)
8127     {
8128       for (uint index= 0; index < m_tot_parts; index++)
8129         delete handler_ctx_array[index];
8130     }
8131   }
8132 };
8133 
8134 
/**
  Check if the requested ALTER can be done in-place.

  Asks every partition's handler and returns the most restrictive
  answer. Also allocates the per-partition context array used by the
  later prepare/inplace/commit phases.

  @param altered_table  Table definition after the ALTER.
  @param ha_alter_info  In/out description of the ALTER operation.
  @return Most restrictive enum_alter_inplace_result among partitions,
          or HA_ALTER_ERROR on allocation/protocol failure.
*/
enum_alter_inplace_result
ha_partition::check_if_supported_inplace_alter(TABLE *altered_table,
                                               Alter_inplace_info *ha_alter_info)
{
  uint index= 0;
  enum_alter_inplace_result result= HA_ALTER_INPLACE_NO_LOCK;
  ha_partition_inplace_ctx *part_inplace_ctx;
  bool first_is_set= false;
  THD *thd= ha_thd();

  DBUG_ENTER("ha_partition::check_if_supported_inplace_alter");
  /*
    Support inplace change of KEY () -> KEY ALGORITHM = N ().
    Any other change would set partition_changed in
    prep_alter_part_table() in mysql_alter_table().
  */
  if (ha_alter_info->alter_info->flags == Alter_info::ALTER_PARTITION)
    DBUG_RETURN(HA_ALTER_INPLACE_NO_LOCK);

  /* We cannot allow INPLACE to change order of KEY partitioning fields! */
  if (ha_alter_info->handler_flags & Alter_inplace_info::ALTER_COLUMN_ORDER)
  {
    /* If column partitioning is used then no need to check partition order */
    if (m_part_info->list_of_part_fields && !m_part_info->column_list)
    {
      if(!check_partition_column_order(&ha_alter_info->alter_info->create_list,
                                       table->part_info->part_field_array))
        DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
    }

    /* Check subpartition ordering */
    if (m_part_info->list_of_subpart_fields)
    {
      if(!check_partition_column_order(&ha_alter_info->alter_info->create_list,
                                       table->part_info->subpart_field_array))
        DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
    }
  }

  part_inplace_ctx=
    new (thd->mem_root) ha_partition_inplace_ctx(thd, m_tot_parts);
  if (!part_inplace_ctx)
    DBUG_RETURN(HA_ALTER_ERROR);

  /* One slot per partition plus a NULL terminator. */
  part_inplace_ctx->handler_ctx_array= (inplace_alter_handler_ctx **)
    thd->alloc(sizeof(inplace_alter_handler_ctx *) * (m_tot_parts + 1));
  if (!part_inplace_ctx->handler_ctx_array)
    DBUG_RETURN(HA_ALTER_ERROR);

  /* Set all to NULL, including the terminating one. */
  for (index= 0; index <= m_tot_parts; index++)
    part_inplace_ctx->handler_ctx_array[index]= NULL;

  for (index= 0; index < m_tot_parts; index++)
  {
    /* Each call may set ha_alter_info->handler_ctx; store it per partition. */
    enum_alter_inplace_result p_result=
      m_file[index]->check_if_supported_inplace_alter(altered_table,
                                                      ha_alter_info);
    part_inplace_ctx->handler_ctx_array[index]= ha_alter_info->handler_ctx;

    if (index == 0)
    {
      first_is_set= (ha_alter_info->handler_ctx != NULL);
    }
    else if (first_is_set != (ha_alter_info->handler_ctx != NULL))
    {
      /* Either none or all partitions must set handler_ctx! */
      DBUG_ASSERT(0);
      DBUG_RETURN(HA_ALTER_ERROR);
    }
    /* Keep the most restrictive (lowest) result. */
    if (p_result < result)
      result= p_result;
    if (result == HA_ALTER_ERROR)
      break;
  }

  ha_alter_info->handler_ctx= part_inplace_ctx;
  /*
    To indicate for future inplace calls that there are several
    partitions/handlers that need to be committed together,
    we set group_commit_ctx to the NULL terminated array of
    the partitions handlers.
  */
  ha_alter_info->group_commit_ctx= part_inplace_ctx->handler_ctx_array;

  DBUG_RETURN(result);
}
8222 
8223 
prepare_inplace_alter_table(TABLE * altered_table,Alter_inplace_info * ha_alter_info)8224 bool ha_partition::prepare_inplace_alter_table(TABLE *altered_table,
8225                                                Alter_inplace_info *ha_alter_info)
8226 {
8227   uint index= 0;
8228   bool error= false;
8229   ha_partition_inplace_ctx *part_inplace_ctx;
8230 
8231   DBUG_ENTER("ha_partition::prepare_inplace_alter_table");
8232 
8233   /*
8234     Changing to similar partitioning, only update metadata.
8235     Non allowed changes would be catched in prep_alter_part_table().
8236   */
8237   if (ha_alter_info->alter_info->flags == Alter_info::ALTER_PARTITION)
8238     DBUG_RETURN(false);
8239 
8240   part_inplace_ctx=
8241     static_cast<class ha_partition_inplace_ctx*>(ha_alter_info->handler_ctx);
8242 
8243   for (index= 0; index < m_tot_parts && !error; index++)
8244   {
8245     ha_alter_info->handler_ctx= part_inplace_ctx->handler_ctx_array[index];
8246     m_file[index]->update_create_info(ha_alter_info->create_info);
8247     if (m_file[index]->ha_prepare_inplace_alter_table(altered_table,
8248                                                       ha_alter_info))
8249       error= true;
8250     part_inplace_ctx->handler_ctx_array[index]= ha_alter_info->handler_ctx;
8251   }
8252   ha_alter_info->handler_ctx= part_inplace_ctx;
8253 
8254   DBUG_RETURN(error);
8255 }
8256 
8257 
inplace_alter_table(TABLE * altered_table,Alter_inplace_info * ha_alter_info)8258 bool ha_partition::inplace_alter_table(TABLE *altered_table,
8259                                        Alter_inplace_info *ha_alter_info)
8260 {
8261   uint index= 0;
8262   bool error= false;
8263   ha_partition_inplace_ctx *part_inplace_ctx;
8264 
8265   DBUG_ENTER("ha_partition::inplace_alter_table");
8266 
8267   /*
8268     Changing to similar partitioning, only update metadata.
8269     Non allowed changes would be catched in prep_alter_part_table().
8270   */
8271   if (ha_alter_info->alter_info->flags == Alter_info::ALTER_PARTITION)
8272     DBUG_RETURN(false);
8273 
8274   part_inplace_ctx=
8275     static_cast<class ha_partition_inplace_ctx*>(ha_alter_info->handler_ctx);
8276 
8277   for (index= 0; index < m_tot_parts && !error; index++)
8278   {
8279     ha_alter_info->handler_ctx= part_inplace_ctx->handler_ctx_array[index];
8280 
8281     if (index != 0 && ha_alter_info->handler_ctx != NULL)
8282       ha_alter_info->handler_ctx->set_shared_data(
8283                                 part_inplace_ctx->handler_ctx_array[index - 1]);
8284 
8285     if (m_file[index]->ha_inplace_alter_table(altered_table,
8286                                               ha_alter_info))
8287       error= true;
8288     part_inplace_ctx->handler_ctx_array[index]= ha_alter_info->handler_ctx;
8289   }
8290   ha_alter_info->handler_ctx= part_inplace_ctx;
8291 
8292   DBUG_RETURN(error);
8293 }
8294 
8295 
8296 /*
8297   Note that this function will try rollback failed ADD INDEX by
8298   executing DROP INDEX for the indexes that were committed (if any)
8299   before the error occured. This means that the underlying storage
8300   engine must be able to drop index in-place with X-lock held.
8301   (As X-lock will be held here if new indexes are to be committed)
8302 */
/**
  Commit or roll back the in-place ALTER on all partitions.

  On commit, the first partition's handler is expected to commit all
  partitions in one call (group commit via group_commit_ctx); if it does
  not clear group_commit_ctx, the remaining partitions are committed one
  by one. Rollback is always done per partition.

  @param altered_table  Table definition after the ALTER.
  @param ha_alter_info  Description of the ALTER operation.
  @param commit         true to commit, false to roll back.
  @return true on error, false on success.
*/
bool ha_partition::commit_inplace_alter_table(TABLE *altered_table,
                                              Alter_inplace_info *ha_alter_info,
                                              bool commit)
{
  ha_partition_inplace_ctx *part_inplace_ctx;
  bool error= false;

  DBUG_ENTER("ha_partition::commit_inplace_alter_table");

  /*
    Changing to similar partitioning, only update metadata.
    Non allowed changes would be catched in prep_alter_part_table().
  */
  if (ha_alter_info->alter_info->flags == Alter_info::ALTER_PARTITION)
    DBUG_RETURN(false);

  part_inplace_ctx=
    static_cast<class ha_partition_inplace_ctx*>(ha_alter_info->handler_ctx);

  if (commit)
  {
    DBUG_ASSERT(ha_alter_info->group_commit_ctx ==
                part_inplace_ctx->handler_ctx_array);
    /* First partition commits for the whole group. */
    ha_alter_info->handler_ctx= part_inplace_ctx->handler_ctx_array[0];
    error= m_file[0]->ha_commit_inplace_alter_table(altered_table,
                                                    ha_alter_info, commit);
    if (error)
      goto end;
    if (ha_alter_info->group_commit_ctx)
    {
      /*
        If ha_alter_info->group_commit_ctx is not set to NULL,
        then the engine did only commit the first partition!
        The engine is probably new, since both innodb and the default
        implementation of handler::commit_inplace_alter_table sets it to NULL
        and simply return false, since it allows metadata changes only.
        Loop over all other partitions as to follow the protocol!
      */
      uint i;
      DBUG_ASSERT(0);
      for (i= 1; i < m_tot_parts; i++)
      {
        ha_alter_info->handler_ctx= part_inplace_ctx->handler_ctx_array[i];
        error|= m_file[i]->ha_commit_inplace_alter_table(altered_table,
                                                         ha_alter_info,
                                                         true);
      }
    }
  }
  else
  {
    uint i;
    for (i= 0; i < m_tot_parts; i++)
    {
      /* Rollback, commit == false,  is done for each partition! */
      ha_alter_info->handler_ctx= part_inplace_ctx->handler_ctx_array[i];
      if (m_file[i]->ha_commit_inplace_alter_table(altered_table,
                                                   ha_alter_info, false))
        error= true;
    }
  }
end:
  /* Restore our own aggregate context. */
  ha_alter_info->handler_ctx= part_inplace_ctx;

  DBUG_RETURN(error);
}
8369 
8370 
notify_table_changed()8371 void ha_partition::notify_table_changed()
8372 {
8373   handler **file;
8374 
8375   DBUG_ENTER("ha_partition::notify_table_changed");
8376 
8377   for (file= m_file; *file; file++)
8378     (*file)->ha_notify_table_changed();
8379 
8380   DBUG_VOID_RETURN;
8381 }
8382 
8383 
8384 /*
8385   If frm_error() is called then we will use this to to find out what file
8386   extensions exist for the storage engine. This is also used by the default
8387   rename_table and delete_table method in handler.cc.
8388 */
8389 
/* NULL-terminated list of file extensions used by this engine (.par). */
static const char *ha_partition_ext[]=
{
  ha_par_ext, NullS
};

/* Return the extension list (used by default rename/delete_table). */
const char **ha_partition::bas_ext() const
{ return ha_partition_ext; }
8397 
8398 
min_of_the_max_uint(uint (handler::* operator_func)(void)const) const8399 uint ha_partition::min_of_the_max_uint(
8400                        uint (handler::*operator_func)(void) const) const
8401 {
8402   handler **file;
8403   uint min_of_the_max= ((*m_file)->*operator_func)();
8404 
8405   for (file= m_file+1; *file; file++)
8406   {
8407     uint tmp= ((*file)->*operator_func)();
8408     set_if_smaller(min_of_the_max, tmp);
8409   }
8410   return min_of_the_max;
8411 }
8412 
8413 
/* Smallest max_supported_key_parts() among all underlying handlers. */
uint ha_partition::max_supported_key_parts() const
{
  return min_of_the_max_uint(&handler::max_supported_key_parts);
}
8418 
8419 
/* Smallest max_supported_key_length() among all underlying handlers. */
uint ha_partition::max_supported_key_length() const
{
  return min_of_the_max_uint(&handler::max_supported_key_length);
}
8424 
8425 
/* Smallest max_supported_key_part_length() among all underlying handlers. */
uint ha_partition::max_supported_key_part_length() const
{
  return min_of_the_max_uint(&handler::max_supported_key_part_length);
}
8430 
8431 
/* Smallest max_supported_record_length() among all underlying handlers. */
uint ha_partition::max_supported_record_length() const
{
  return min_of_the_max_uint(&handler::max_supported_record_length);
}
8436 
8437 
/* Smallest max_supported_keys() among all underlying handlers. */
uint ha_partition::max_supported_keys() const
{
  return min_of_the_max_uint(&handler::max_supported_keys);
}
8442 
8443 
extra_rec_buf_length() const8444 uint ha_partition::extra_rec_buf_length() const
8445 {
8446   handler **file;
8447   uint max= (*m_file)->extra_rec_buf_length();
8448 
8449   for (file= m_file, file++; *file; file++)
8450     if (max < (*file)->extra_rec_buf_length())
8451       max= (*file)->extra_rec_buf_length();
8452   return max;
8453 }
8454 
8455 
min_record_length(uint options) const8456 uint ha_partition::min_record_length(uint options) const
8457 {
8458   handler **file;
8459   uint max= (*m_file)->min_record_length(options);
8460 
8461   for (file= m_file, file++; *file; file++)
8462     if (max < (*file)->min_record_length(options))
8463       max= (*file)->min_record_length(options);
8464   return max;
8465 }
8466 
8467 
8468 /****************************************************************************
8469                 MODULE compare records
8470 ****************************************************************************/
8471 /*
8472   Compare two positions
8473 
8474   SYNOPSIS
8475     cmp_ref()
8476     ref1                   First position
8477     ref2                   Second position
8478 
8479   RETURN VALUE
8480     <0                     ref1 < ref2
8481     0                      Equal
8482     >0                     ref1 > ref2
8483 
8484   DESCRIPTION
8485     We get two references and need to check if those records are the same.
8486     If they belong to different partitions we decide that they are not
8487     the same record. Otherwise we use the particular handler to decide if
8488     they are the same. Sort in partition id order if not equal.
8489 */
8490 
/**
  Compare two row references.

  A ref is PARTITION_BYTES_IN_POS bytes of partition id followed by the
  underlying engine's ref. The engine part is compared first; on a tie,
  references in the same partition are equal, otherwise they are ordered
  by partition id.

  @param ref1  First position.
  @param ref2  Second position.
  @return <0 / 0 / >0 as for a comparator.
*/
int ha_partition::cmp_ref(const uchar *ref1, const uchar *ref2)
{
  int cmp;
  my_ptrdiff_t diff1, diff2;
  DBUG_ENTER("ha_partition::cmp_ref");

  /* Compare the engine parts, skipping the partition-id prefix. */
  cmp = m_file[0]->cmp_ref((ref1 + PARTITION_BYTES_IN_POS),
			   (ref2 + PARTITION_BYTES_IN_POS));
  if (cmp)
    DBUG_RETURN(cmp);

  if ((ref1[0] == ref2[0]) && (ref1[1] == ref2[1]))
  {
   /* This means that the references are same and are in same partition.*/
    DBUG_RETURN(0);
  }

  /*
    In Innodb we compare with either primary key value or global DB_ROW_ID so
    it is not possible that the two references are equal and are in different
    partitions, but in myisam it is possible since we are comparing offsets.
    Remove this assert if DB_ROW_ID is changed to be per partition.
  */
  DBUG_ASSERT(!m_innodb);

  /* Order by partition id: high byte first, then low byte. */
  diff1= ref2[1] - ref1[1];
  diff2= ref2[0] - ref1[0];
  if (diff1 > 0)
  {
    DBUG_RETURN(-1);
  }
  if (diff1 < 0)
  {
    DBUG_RETURN(+1);
  }
  if (diff2 > 0)
  {
    DBUG_RETURN(-1);
  }
  DBUG_RETURN(+1);
}
8532 
8533 
8534 /****************************************************************************
8535                 MODULE auto increment
8536 ****************************************************************************/
8537 
8538 
reset_auto_increment(ulonglong value)8539 int ha_partition::reset_auto_increment(ulonglong value)
8540 {
8541   handler **file= m_file;
8542   int res;
8543   DBUG_ENTER("ha_partition::reset_auto_increment");
8544   lock_auto_increment();
8545   part_share->auto_inc_initialized= false;
8546   part_share->next_auto_inc_val= 0;
8547   do
8548   {
8549     if ((res= (*file)->ha_reset_auto_increment(value)) != 0)
8550       break;
8551   } while (*(++file));
8552   unlock_auto_increment();
8553   DBUG_RETURN(res);
8554 }
8555 
8556 
8557 /**
8558   This method is called by update_auto_increment which in turn is called
8559   by the individual handlers as part of write_row. We use the
8560   part_share->next_auto_inc_val, or search all
8561   partitions for the highest auto_increment_value if not initialized or
8562   if auto_increment field is a secondary part of a key, we must search
8563   every partition when holding a mutex to be sure of correctness.
8564 */
8565 
/**
  Reserve auto-increment values (called via update_auto_increment from
  write_row in the underlying handlers).

  @param offset              Auto-increment offset.
  @param increment           Auto-increment step.
  @param nb_desired_values   Number of values the caller wants.
  @param[out] first_value    First reserved value (ULONGLONG_MAX on error).
  @param[out] nb_reserved_values  Number of values actually reserved.
*/
void ha_partition::get_auto_increment(ulonglong offset, ulonglong increment,
                                      ulonglong nb_desired_values,
                                      ulonglong *first_value,
                                      ulonglong *nb_reserved_values)
{
  DBUG_ENTER("ha_partition::get_auto_increment");
  DBUG_PRINT("info", ("offset: %lu inc: %lu desired_values: %lu "
                      "first_value: %lu", (ulong) offset, (ulong) increment,
                      (ulong) nb_desired_values, (ulong) *first_value));
  DBUG_ASSERT(increment && nb_desired_values);
  *first_value= 0;
  if (table->s->next_number_keypart)
  {
    /*
      next_number_keypart is != 0 if the auto_increment column is a secondary
      column in the index (it is allowed in MyISAM)
    */
    DBUG_PRINT("info", ("next_number_keypart != 0"));
    ulonglong nb_reserved_values_part;
    ulonglong first_value_part, max_first_value;
    handler **file= m_file;
    first_value_part= max_first_value= *first_value;
    /* Must lock and find highest value among all partitions. */
    lock_auto_increment();
    do
    {
      /* Only nb_desired_values = 1 makes sense */
      (*file)->get_auto_increment(offset, increment, 1,
                                 &first_value_part, &nb_reserved_values_part);
      if (first_value_part == ULONGLONG_MAX) // error in one partition
      {
        *first_value= first_value_part;
        /* log that the error was between table/partition handler */
        sql_print_error("Partition failed to reserve auto_increment value");
        unlock_auto_increment();
        DBUG_VOID_RETURN;
      }
      DBUG_PRINT("info", ("first_value_part: %lu", (ulong) first_value_part));
      set_if_bigger(max_first_value, first_value_part);
    } while (*(++file));
    /* The reservation is the highest value any partition handed out. */
    *first_value= max_first_value;
    *nb_reserved_values= 1;
    unlock_auto_increment();
  }
  else
  {
    THD *thd= ha_thd();
    /*
      This is initialized in the beginning of the first write_row call.
    */
    DBUG_ASSERT(part_share->auto_inc_initialized);
    /*
      Get a lock for handling the auto_increment in part_share
      for avoiding two concurrent statements getting the same number.
    */

    lock_auto_increment();

    /*
      In a multi-row insert statement like INSERT SELECT and LOAD DATA
      where the number of candidate rows to insert is not known in advance
      we must hold a lock/mutex for the whole statement if we have statement
      based replication. Because the statement-based binary log contains
      only the first generated value used by the statement, and slaves assumes
      all other generated values used by this statement were consecutive to
      this first one, we must exclusively lock the generator until the statement
      is done.
    */
    if (!auto_increment_safe_stmt_log_lock &&
        thd->lex->sql_command != SQLCOM_INSERT &&
        mysql_bin_log.is_open() &&
        !thd->is_current_stmt_binlog_format_row() &&
        (thd->variables.option_bits & OPTION_BIN_LOG))
    {
      DBUG_PRINT("info", ("locking auto_increment_safe_stmt_log_lock"));
      auto_increment_safe_stmt_log_lock= TRUE;
    }

    /* this gets corrected (for offset/increment) in update_auto_increment */
    *first_value= part_share->next_auto_inc_val;
    part_share->next_auto_inc_val+= nb_desired_values * increment;

    unlock_auto_increment();
    DBUG_PRINT("info", ("*first_value: %lu", (ulong) *first_value));
    *nb_reserved_values= nb_desired_values;
  }
  DBUG_VOID_RETURN;
}
8654 
/**
  Release unused reserved auto-increment values.

  If the auto_increment column is a secondary key part, delegates to
  every locked partition. Otherwise, if this thread reserved the top of
  the shared counter and did not use it all, lowers the shared counter
  back to next_insert_id, and drops the statement-level binlog lock
  taken in get_auto_increment().
*/
void ha_partition::release_auto_increment()
{
  DBUG_ENTER("ha_partition::release_auto_increment");

  if (table->s->next_number_keypart)
  {
    uint i;
    for (i= bitmap_get_first_set(&m_part_info->lock_partitions);
         i < m_tot_parts;
         i= bitmap_get_next_set(&m_part_info->lock_partitions, i))
    {
      m_file[i]->ha_release_auto_increment();
    }
  }
  else if (next_insert_id)
  {
    ulonglong next_auto_inc_val;
    lock_auto_increment();
    next_auto_inc_val= part_share->next_auto_inc_val;
    /*
      If the current auto_increment values is lower than the reserved
      value, and the reserved value was reserved by this thread,
      we can lower the reserved value.
    */
    if (next_insert_id < next_auto_inc_val &&
        auto_inc_interval_for_cur_row.maximum() >= next_auto_inc_val)
    {
      THD *thd= ha_thd();
      /*
        Check that we do not lower the value because of a failed insert
        with SET INSERT_ID, i.e. forced/non generated values.
      */
      if (thd->auto_inc_intervals_forced.maximum() < next_insert_id)
        part_share->next_auto_inc_val= next_insert_id;
    }
    DBUG_PRINT("info", ("part_share->next_auto_inc_val: %lu",
                        (ulong) part_share->next_auto_inc_val));

    /* Unlock the multi row statement lock taken in get_auto_increment */
    if (auto_increment_safe_stmt_log_lock)
    {
      auto_increment_safe_stmt_log_lock= FALSE;
      DBUG_PRINT("info", ("unlocking auto_increment_safe_stmt_log_lock"));
    }

    unlock_auto_increment();
  }
  DBUG_VOID_RETURN;
}
8704 
8705 /****************************************************************************
8706                 MODULE initialize handler for HANDLER call
8707 ****************************************************************************/
8708 
/* No special setup is needed for the HANDLER interface; intentionally empty. */
void ha_partition::init_table_handle_for_HANDLER()
{
  return;
}
8713 
8714 
8715 /**
8716   Return the checksum of the table (all partitions)
8717 */
8718 
checksum() const8719 uint ha_partition::checksum() const
8720 {
8721   ha_checksum sum= 0;
8722 
8723   DBUG_ENTER("ha_partition::checksum");
8724   if ((table_flags() & HA_HAS_CHECKSUM))
8725   {
8726     handler **file= m_file;
8727     do
8728     {
8729       sum+= (*file)->checksum();
8730     } while (*(++file));
8731   }
8732   DBUG_RETURN(sum);
8733 }
8734 
8735 
8736 /****************************************************************************
8737                 MODULE enable/disable indexes
8738 ****************************************************************************/
8739 
8740 /*
8741   Disable indexes for a while
8742   SYNOPSIS
8743     disable_indexes()
8744     mode                      Mode
8745   RETURN VALUES
8746     0                         Success
8747     != 0                      Error
8748 */
8749 
disable_indexes(uint mode)8750 int ha_partition::disable_indexes(uint mode)
8751 {
8752   handler **file;
8753   int error= 0;
8754 
8755   DBUG_ASSERT(bitmap_is_set_all(&(m_part_info->lock_partitions)));
8756   for (file= m_file; *file; file++)
8757   {
8758     if ((error= (*file)->ha_disable_indexes(mode)))
8759       break;
8760   }
8761   return error;
8762 }
8763 
8764 
8765 /*
8766   Enable indexes again
8767   SYNOPSIS
8768     enable_indexes()
8769     mode                      Mode
8770   RETURN VALUES
8771     0                         Success
8772     != 0                      Error
8773 */
8774 
enable_indexes(uint mode)8775 int ha_partition::enable_indexes(uint mode)
8776 {
8777   handler **file;
8778   int error= 0;
8779 
8780   DBUG_ASSERT(bitmap_is_set_all(&(m_part_info->lock_partitions)));
8781   for (file= m_file; *file; file++)
8782   {
8783     if ((error= (*file)->ha_enable_indexes(mode)))
8784       break;
8785   }
8786   return error;
8787 }
8788 
8789 
8790 /*
8791   Check if indexes are disabled
8792   SYNOPSIS
8793     indexes_are_disabled()
8794 
8795   RETURN VALUES
8796     0                      Indexes are enabled
8797     != 0                   Indexes are disabled
8798 */
8799 
indexes_are_disabled(void)8800 int ha_partition::indexes_are_disabled(void)
8801 {
8802   handler **file;
8803   int error= 0;
8804 
8805   DBUG_ASSERT(bitmap_is_set_all(&(m_part_info->lock_partitions)));
8806   for (file= m_file; *file; file++)
8807   {
8808     if ((error= (*file)->indexes_are_disabled()))
8809       break;
8810   }
8811   return error;
8812 }
8813 
8814 
8815 /**
8816   Check/fix misplaced rows.
8817 
8818   @param read_part_id  Partition to check/fix.
8819   @param repair        If true, move misplaced rows to correct partition.
8820 
8821   @return Operation status.
8822     @retval 0     Success
8823     @retval != 0  Error
8824 */
8825 
int ha_partition::check_misplaced_rows(uint read_part_id, bool repair)
{
  int result= 0;
  uint32 correct_part_id;            // Partition the current row should be in
  longlong func_value;               // Raw partition-function value (unused here)
  longlong num_misplaced_rows= 0;    // Counter for the summary/warning message

  DBUG_ENTER("ha_partition::check_misplaced_rows");

  DBUG_ASSERT(m_file);

  if (repair)
  {
    /* We must read the full row, if we need to move it! */
    bitmap_set_all(table->read_set);
    bitmap_set_all(table->write_set);
  }
  else
  {
    /* Only need to read the partitioning fields. */
    bitmap_union(table->read_set, &m_part_info->full_part_field_set);
  }

  /* Full sequential scan of the partition being checked. */
  if ((result= m_file[read_part_id]->ha_rnd_init(1)))
    DBUG_RETURN(result);

  while (true)
  {
    if ((result= m_file[read_part_id]->ha_rnd_next(m_rec0)))
    {
      /* Deleted rows are not an error for a scan; just skip them. */
      if (result == HA_ERR_RECORD_DELETED)
        continue;
      if (result != HA_ERR_END_OF_FILE)
        break;

      /* Normal termination: report how many rows were moved (repair mode). */
      if (num_misplaced_rows > 0)
      {
	print_admin_msg(ha_thd(), MI_MAX_MSG_BUF, "warning",
                        table_share->db.str, table->alias,
                        opt_op_name[REPAIR_PARTS],
                        "Moved %lld misplaced rows",
                        num_misplaced_rows);
      }
      /* End-of-file reached, all rows are now OK, reset result and break. */
      result= 0;
      break;
    }

    /* Recompute where this row belongs according to the partition function. */
    result= m_part_info->get_partition_id(m_part_info, &correct_part_id,
                                          &func_value);
    if (result)
      break;

    if (correct_part_id != read_part_id)
    {
      num_misplaced_rows++;
      if (!repair)
      {
        /* Check. */
	print_admin_msg(ha_thd(), MI_MAX_MSG_BUF, "error",
                        table_share->db.str, table->alias,
                        opt_op_name[CHECK_PARTS],
                        "Found a misplaced row");
        /* Break on first misplaced row! */
        result= HA_ADMIN_NEEDS_UPGRADE;
        break;
      }
      else
      {
        DBUG_PRINT("info", ("Moving row from partition %d to %d",
                            read_part_id, correct_part_id));

        /*
          Insert row into correct partition. Notice that there are no commit
          for every N row, so the repair will be one large transaction!
        */
        if ((result= m_file[correct_part_id]->ha_write_row(m_rec0)))
        {
          /*
            We have failed to insert a row, it might have been a duplicate!
          */
          char buf[MAX_KEY_LENGTH];
          String str(buf,sizeof(buf),system_charset_info);
          str.length(0);
          if (result == HA_ERR_FOUND_DUPP_KEY)
          {
            str.append("Duplicate key found, "
                       "please update or delete the record:\n");
            result= HA_ADMIN_CORRUPT;
          }
          /* m_err_rec = NULL makes append_row_to_str use the current row. */
          m_err_rec= NULL;
          append_row_to_str(str);

          /*
            If the engine supports transactions, the failure will be
            rollbacked.
          */
          if (!m_file[correct_part_id]->has_transactions())
          {
            /* Log this error, so the DBA can notice it and fix it! */
            sql_print_error("Table '%-192s' failed to move/insert a row"
                            " from part %d into part %d:\n%s",
                            table->s->table_name.str,
                            read_part_id,
                            correct_part_id,
                            str.c_ptr_safe());
          }
	  print_admin_msg(ha_thd(), MI_MAX_MSG_BUF, "error",
                          table_share->db.str, table->alias,
                          opt_op_name[REPAIR_PARTS],
                          "Failed to move/insert a row"
                          " from part %d into part %d:\n%s",
                          read_part_id,
                          correct_part_id,
                          str.c_ptr_safe());
          break;
        }

        /* Delete row from wrong partition. */
        if ((result= m_file[read_part_id]->ha_delete_row(m_rec0)))
        {
          /* Transactional engine: whole repair rolls back, nothing to log. */
          if (m_file[correct_part_id]->has_transactions())
            break;
          /*
            We have introduced a duplicate, since we failed to remove it
            from the wrong partition.
          */
          char buf[MAX_KEY_LENGTH];
          String str(buf,sizeof(buf),system_charset_info);
          str.length(0);
          m_err_rec= NULL;
          append_row_to_str(str);

          /* Log this error, so the DBA can notice it and fix it! */
          sql_print_error("Table '%-192s': Delete from part %d failed with"
                          " error %d. But it was already inserted into"
                          " part %d, when moving the misplaced row!"
                          "\nPlease manually fix the duplicate row:\n%s",
                          table->s->table_name.str,
                          read_part_id,
                          result,
                          correct_part_id,
                          str.c_ptr_safe());
          break;
        }
      }
    }
  }

  /* Always end the scan; preserve the first error if there was one. */
  int tmp_result= m_file[read_part_id]->ha_rnd_end();
  DBUG_RETURN(result ? result : tmp_result);
}
8978 
8979 
8980 #define KEY_PARTITIONING_CHANGED_STR \
8981   "KEY () partitioning changed, please run:\n" \
8982   "ALTER TABLE %s.%s ALGORITHM = INPLACE %s"
8983 
/*
  Check whether the partitioned table needs a metadata upgrade.
  Returns HA_ADMIN_NEEDS_CHECK by default; HA_ADMIN_FAILED when a 5.1-era
  KEY-partitioned table with hash-affected field types is detected (the
  user is told how to fix it), and HA_ADMIN_NEEDS_UPGRADE/other codes
  may come back unchanged from the default path.
*/
int ha_partition::check_for_upgrade(HA_CHECK_OPT *check_opt)
{
  int error= HA_ADMIN_NEEDS_CHECK;
  DBUG_ENTER("ha_partition::check_for_upgrade");

  /*
    This is called even without FOR UPGRADE,
    if the .frm version is lower than the current version.
    In that case return that it needs checking!
  */
  if (!(check_opt->sql_flags & TT_FOR_UPGRADE))
    DBUG_RETURN(error);

  /*
    Partitions will be checked for during their ha_check!

    Check if KEY (sub)partitioning was used and any field's hash calculation
    differs from 5.1, see bug#14521864.
  */
  if (table->s->mysql_version < 50503 &&              // 5.1 table (<5.5.3)
      ((m_part_info->part_type == HASH_PARTITION &&   // KEY partitioned
        m_part_info->list_of_part_fields) ||
       (m_is_sub_partitioned &&                       // KEY subpartitioned
        m_part_info->list_of_subpart_fields)))
  {
    Field **field;
    /* Inspect the field list actually used for the KEY hash. */
    if (m_is_sub_partitioned)
    {
      field= m_part_info->subpart_field_array;
    }
    else
    {
      field= m_part_info->part_field_array;
    }
    for (; *field; field++)
    {
      /*
        These are the types whose KEY hash value changed between 5.1 and
        5.5 (bug#14521864); any one of them makes the table affected.
      */
      switch ((*field)->real_type()) {
      case MYSQL_TYPE_TINY:
      case MYSQL_TYPE_SHORT:
      case MYSQL_TYPE_LONG:
      case MYSQL_TYPE_FLOAT:
      case MYSQL_TYPE_DOUBLE:
      case MYSQL_TYPE_NEWDECIMAL:
      case MYSQL_TYPE_TIMESTAMP:
      case MYSQL_TYPE_LONGLONG:
      case MYSQL_TYPE_INT24:
      case MYSQL_TYPE_TIME:
      case MYSQL_TYPE_DATETIME:
      case MYSQL_TYPE_YEAR:
      case MYSQL_TYPE_NEWDATE:
      case MYSQL_TYPE_ENUM:
      case MYSQL_TYPE_SET:
        {
          THD *thd= ha_thd();
          char *part_buf;
          String db_name, table_name;
          uint part_buf_len;
          bool skip_generation= false;
          partition_info::enum_key_algorithm old_algorithm;
          old_algorithm= m_part_info->key_algorithm;
          error= HA_ADMIN_FAILED;
          append_identifier(ha_thd(), &db_name, table_share->db.str,
                            table_share->db.length);
          append_identifier(ha_thd(), &table_name, table_share->table_name.str,
                            table_share->table_name.length);
          if (m_part_info->key_algorithm != partition_info::KEY_ALGORITHM_NONE)
          {
            /*
              Only possible when someone tampered with .frm files,
              like during tests :)
            */
            skip_generation= true;
          }
          /*
            Temporarily mark the algorithm as 5.1 so the generated
            PARTITION BY clause in the message reflects the old hashing;
            restored below before returning.
          */
          m_part_info->key_algorithm= partition_info::KEY_ALGORITHM_51;
          if (skip_generation ||
              !(part_buf= generate_partition_syntax(m_part_info,
                                                    &part_buf_len,
                                                    true,
                                                    true,
                                                    NULL,
                                                    NULL,
                                                    NULL)) ||
	      print_admin_msg(thd, SQL_ADMIN_MSG_TEXT_SIZE + 1, "error",
	                      table_share->db.str,
	                      table->alias,
                              opt_op_name[CHECK_PARTS],
                              KEY_PARTITIONING_CHANGED_STR,
                              db_name.c_ptr_safe(),
                              table_name.c_ptr_safe(),
                              part_buf))
	  {
	    /* Error creating admin message (too long string?). */
	    print_admin_msg(thd, MI_MAX_MSG_BUF, "error",
                            table_share->db.str, table->alias,
                            opt_op_name[CHECK_PARTS],
                            KEY_PARTITIONING_CHANGED_STR,
                            db_name.c_ptr_safe(), table_name.c_ptr_safe(),
                            "<old partition clause>, but add ALGORITHM = 1"
                            " between 'KEY' and '(' to change the metadata"
                            " without the need of a full table rebuild.");
          }
          m_part_info->key_algorithm= old_algorithm;
          DBUG_RETURN(error);
        }
      default:
        /* Not affected! */
        ;
      }
    }
  }

  DBUG_RETURN(error);
}
9097 
9098 
/* Storage-engine descriptor handed to the plugin registration below. */
struct st_mysql_storage_engine partition_storage_engine=
{ MYSQL_HANDLERTON_INTERFACE_VERSION };
9101 
/* Plugin registration for the partition storage-engine helper. */
mysql_declare_plugin(partition)
{
  MYSQL_STORAGE_ENGINE_PLUGIN,
  &partition_storage_engine,
  "partition",
  "Mikael Ronstrom, MySQL AB",
  "Partition Storage Engine Helper",
  PLUGIN_LICENSE_GPL,
  partition_initialize, /* Plugin Init */
  NULL, /* Plugin Deinit */
  0x0100, /* 1.0 */
  NULL,                       /* status variables                */
  NULL,                       /* system variables                */
  NULL,                       /* config options                  */
  0,                          /* flags                           */
}
mysql_declare_plugin_end;
9119 
9120 #endif
9121