1 #ifndef HISTOGRAMS_HISTOGRAM_INCLUDED
2 #define HISTOGRAMS_HISTOGRAM_INCLUDED
3 
4 /* Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License, version 2.0,
8    as published by the Free Software Foundation.
9 
10    This program is also distributed with certain software (including
11    but not limited to OpenSSL) that is licensed under separate terms,
12    as designated in a particular file or component or in included license
13    documentation.  The authors of MySQL hereby grant you an additional
14    permission to link the program and your derivative works with the
15    separately licensed software that they have included with MySQL.
16 
17    This program is distributed in the hope that it will be useful,
18    but WITHOUT ANY WARRANTY; without even the implied warranty of
19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20    GNU General Public License, version 2.0, for more details.
21 
22    You should have received a copy of the GNU General Public License
23    along with this program; if not, write to the Free Software
24    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
25 
26 /**
27   @file sql/histograms/histogram.h
28   Histogram base class.
29 
30   This file defines the base class for all histogram types. We keep the base
31   class itself non-templatized in order to more easily send a histogram as an
32   argument, collect multiple histograms in a single collection etc.
33 
34   A histogram is stored as a JSON object. This gives the flexibility of storing
35   virtually an unlimited number of buckets, data values in its full length and
36   easily expanding with new histogram types in the future. They are stored
37   persistently in the system table mysql.column_stats.
38 
39   We keep all histogram code in the namespace "histograms" in order to avoid
40   name conflicts etc.
41 */
42 
43 #include <cstddef>  // size_t
44 #include <functional>
45 #include <map>      // std::map
46 #include <set>      // std::set
47 #include <string>   // std::string
48 #include <utility>  // std::pair
49 
50 #include "lex_string.h"  // LEX_CSTRING
51 #include "my_base.h"     // ha_rows
52 #include "sql/histograms/value_map_type.h"
53 #include "sql/mem_root_allocator.h"   // Mem_root_allocator
54 #include "sql/stateless_allocator.h"  // Stateless_allocator
55 
56 class Item;
57 class Json_dom;
58 class Json_object;
59 class THD;
60 struct TYPELIB;
61 
62 namespace dd {
63 class Table;
64 }  // namespace dd
65 namespace histograms {
66 struct Histogram_comparator;
67 template <class T>
68 class Value_map;
69 }  // namespace histograms
70 struct CHARSET_INFO;
71 struct MEM_ROOT;
72 struct TABLE_LIST;
73 
74 namespace histograms {
75 
76 /// The default (and invalid) value for "m_null_values_fraction".
77 static const double INVALID_NULL_VALUES_FRACTION = -1.0;
78 
79 enum class Message {
80   FIELD_NOT_FOUND,
81   UNSUPPORTED_DATA_TYPE,
82   TEMPORARY_TABLE,
83   ENCRYPTED_TABLE,
84   VIEW,
85   HISTOGRAM_CREATED,
86   MULTIPLE_TABLES_SPECIFIED,
87   COVERED_BY_SINGLE_PART_UNIQUE_INDEX,
88   NO_HISTOGRAM_FOUND,
89   HISTOGRAM_DELETED,
90   SERVER_READ_ONLY
91 };
92 
93 struct Histogram_psi_key_alloc {
94   void *operator()(size_t s) const;
95 };
96 
97 template <class T>
98 using Histogram_key_allocator = Stateless_allocator<T, Histogram_psi_key_alloc>;
99 
100 template <class T>
101 using value_map_allocator = Mem_root_allocator<std::pair<const T, ha_rows>>;
102 
103 template <typename T>
104 using value_map_type =
105     std::map<T, ha_rows, Histogram_comparator, value_map_allocator<T>>;
106 
107 using columns_set = std::set<std::string, std::less<std::string>,
108                              Histogram_key_allocator<std::string>>;
109 
110 using results_map =
111     std::map<std::string, Message, std::less<std::string>,
112              Histogram_key_allocator<std::pair<const std::string, Message>>>;
113 
114 /**
115   The different operators we can ask histogram statistics for selectivity
116   estimations.
117 */
118 enum class enum_operator {
119   EQUALS_TO,
120   GREATER_THAN,
121   LESS_THAN,
122   IS_NULL,
123   IS_NOT_NULL,
124   LESS_THAN_OR_EQUAL,
125   GREATER_THAN_OR_EQUAL,
126   NOT_EQUALS_TO,
127   BETWEEN,
128   NOT_BETWEEN,
129   IN_LIST,
130   NOT_IN_LIST
131 };
132 
133 /**
134   Histogram base class.
135 */
136 class Histogram {
137  public:
138   /// All supported histogram types in MySQL.
139   enum class enum_histogram_type { EQUI_HEIGHT, SINGLETON };
140 
141   /// String representation of the JSON field "histogram-type".
histogram_type_str()142   static constexpr const char *histogram_type_str() { return "histogram-type"; }
143 
144   /// String representation of the JSON field "data-type".
data_type_str()145   static constexpr const char *data_type_str() { return "data-type"; }
146 
147   /// String representation of the JSON field "collation-id".
collation_id_str()148   static constexpr const char *collation_id_str() { return "collation-id"; }
149 
150   /// String representation of the histogram type SINGLETON.
singleton_str()151   static constexpr const char *singleton_str() { return "singleton"; }
152 
153   /// String representation of the histogram type EQUI-HEIGHT.
equi_height_str()154   static constexpr const char *equi_height_str() { return "equi-height"; }
155 
156  protected:
157   double m_sampling_rate;
158 
159   /// The fraction of NULL values in the histogram (between 0.0 and 1.0).
160   double m_null_values_fraction;
161 
162   /// The character set for the data stored
163   const CHARSET_INFO *m_charset;
164 
165   /// The number of buckets originally specified
166   size_t m_num_buckets_specified;
167 
168   /// String representation of the JSON field "buckets".
buckets_str()169   static constexpr const char *buckets_str() { return "buckets"; }
170 
171   /// String representation of the JSON field "last-updated".
last_updated_str()172   static constexpr const char *last_updated_str() { return "last-updated"; }
173 
174   /// String representation of the JSON field "null-values".
null_values_str()175   static constexpr const char *null_values_str() { return "null-values"; }
176 
sampling_rate_str()177   static constexpr const char *sampling_rate_str() { return "sampling-rate"; }
178 
179   /// String representation of the JSON field "number-of-buckets-specified".
numer_of_buckets_specified_str()180   static constexpr const char *numer_of_buckets_specified_str() {
181     return "number-of-buckets-specified";
182   }
183 
184   /**
185     Write the data type of this histogram into a JSON object.
186 
187     @param json_object the JSON object where we will write the histogram
188                        data type
189 
190     @return true on error, false otherwise
191   */
192   bool histogram_data_type_to_json(Json_object *json_object) const;
193 
194   /**
195     Return the value that is contained in the JSON DOM object.
196 
197     For most types, this function simply returns the contained value. For String
198     values, the value is allocated on this histograms MEM_ROOT before it is
199     returned. This allows the String value to survive the entire lifetime of the
200     histogram object.
201 
202     @param json_dom the JSON DOM object to extract the value from
203     @param out the value from the JSON DOM object
204 
205     @return true on error, false otherwise
206   */
207   template <class T>
208   bool extract_json_dom_value(const Json_dom *json_dom, T *out);
209 
210   /**
211     Populate the histogram with data from the provided JSON object. The base
212     class also provides an implementation that subclasses must call in order
213     to populate fields that are shared among all histogram types (character set,
214     null values fraction).
215 
216     @param json_object the JSON object to read the histogram data from
217 
218     @return true on error, false otherwise
219   */
220   virtual bool json_to_histogram(const Json_object &json_object) = 0;
221 
222  private:
223   /// The MEM_ROOT where the histogram contents will be allocated.
224   MEM_ROOT *m_mem_root;
225 
226   /// The type of this histogram.
227   const enum_histogram_type m_hist_type;
228 
229   /// The type of the data this histogram contains.
230   const Value_map_type m_data_type;
231 
232   /// Name of the database this histogram represents.
233   LEX_CSTRING m_database_name;
234 
235   /// Name of the table this histogram represents.
236   LEX_CSTRING m_table_name;
237 
238   /// Name of the column this histogram represents.
239   LEX_CSTRING m_column_name;
240 
241   /**
242     An internal function for getting the selecitvity estimation.
243 
244     This function will read/evaluate the value from the given Item, and pass
245     this value on to the correct selectivity estimation function based on the
246     data type of the histogram. For instance, if the data type of the histogram
247     is INT, we will call "val_int" on the Item to evaulate the value as an
248     integer and pass this value on to the next function.
249 
250     @param item The Item to read/evaluate the value from.
251     @param op The operator we are estimating the selectivity for.
252     @param typelib In the case of ENUM or SET data type, this parameter holds
253                    the type information. This is needed in order to map a
254                    string representation of an ENUM/SET value into its correct
255                    integer representation (ENUM/SET values are stored as
256                    integer values in the histogram).
257     @param[out] selectivity The estimated selectivity, between 0.0 and 1.0
258                 inclusive.
259 
260     @return true on error (i.e the provided item was NULL), false on success.
261   */
262   bool get_selectivity_dispatcher(Item *item, const enum_operator op,
263                                   const TYPELIB *typelib,
264                                   double *selectivity) const;
265 
266   /**
267     An internal function for getting the selecitvity estimation.
268 
269     This function will cast the histogram to the correct class (using down_cast)
270     and pass the given value on to the correct selectivity estimation function
271     for that class.
272 
273     @param value The value to estimate the selectivity for.
274 
275     @return The estimated selectivity, between 0.0 and 1.0 inclusive.
276   */
277   template <class T>
278   double get_less_than_selectivity_dispatcher(const T &value) const;
279 
280   /// @see get_less_than_selectivity_dispatcher
281   template <class T>
282   double get_greater_than_selectivity_dispatcher(const T &value) const;
283 
284   /// @see get_less_than_selectivity_dispatcher
285   template <class T>
286   double get_equal_to_selectivity_dispatcher(const T &value) const;
287 
288   /**
289     An internal function for applying the correct function for the given
290     operator.
291 
292     @param op    The operator to apply
293     @param value The value to find the selectivity for.
294 
295     @return The estimated selectivity, between 0.0 and 1.0 inclusive.
296   */
297   template <class T>
298   double apply_operator(const enum_operator op, const T &value) const;
299 
300  public:
301   /**
302     Constructor.
303 
304     @param mem_root  the mem_root where the histogram contents will be allocated
305     @param db_name   name of the database this histogram represents
306     @param tbl_name  name of the table this histogram represents
307     @param col_name  name of the column this histogram represents
308     @param type      the histogram type (equi-height, singleton)
309     @param data_type the type of data that this histogram contains
310   */
311   Histogram(MEM_ROOT *mem_root, const std::string &db_name,
312             const std::string &tbl_name, const std::string &col_name,
313             enum_histogram_type type, Value_map_type data_type);
314 
315   /**
316     Copy constructor
317 
318     This will make a copy of the provided histogram onto the provided MEM_ROOT.
319   */
320   Histogram(MEM_ROOT *mem_root, const Histogram &other);
321 
322   Histogram(const Histogram &other) = delete;
323 
324   /// Destructor.
~Histogram()325   virtual ~Histogram() {}
326 
327   /// @return the MEM_ROOT that this histogram uses for allocations
get_mem_root()328   MEM_ROOT *get_mem_root() const { return m_mem_root; }
329 
330   /**
331     @return name of the database this histogram represents
332   */
get_database_name()333   const LEX_CSTRING get_database_name() const { return m_database_name; }
334 
335   /**
336     @return name of the table this histogram represents
337   */
get_table_name()338   const LEX_CSTRING get_table_name() const { return m_table_name; }
339 
340   /**
341     @return name of the column this histogram represents
342   */
get_column_name()343   const LEX_CSTRING get_column_name() const { return m_column_name; }
344 
345   /**
346     @return type of this histogram
347   */
get_histogram_type()348   enum_histogram_type get_histogram_type() const { return m_hist_type; }
349 
350   /**
351     @return the fraction of NULL values, in the range [0.0, 1.0]
352   */
353   double get_null_values_fraction() const;
354 
355   /// @return the character set for the data this histogram contains
get_character_set()356   const CHARSET_INFO *get_character_set() const { return m_charset; }
357 
358   /// @return the sampling rate used to generate this histogram
get_sampling_rate()359   double get_sampling_rate() const { return m_sampling_rate; }
360 
361   /**
362     Returns the histogram type as a readable string.
363 
364     @return a readable string representation of the histogram type
365   */
366   virtual std::string histogram_type_to_str() const = 0;
367 
368   /**
369     @return number of buckets in this histogram
370   */
371   virtual size_t get_num_buckets() const = 0;
372 
373   /**
374     Get the estimated number of distinct non-NULL values.
375     @return number of distinct non-NULL values
376   */
377   virtual size_t get_num_distinct_values() const = 0;
378 
379   /**
380     @return the data type that this histogram contains
381   */
get_data_type()382   Value_map_type get_data_type() const { return m_data_type; }
383 
384   /**
385     @return number of buckets originally specified by the user. This may be
386             higher than the actual number of buckets in the histogram.
387   */
get_num_buckets_specified()388   size_t get_num_buckets_specified() const { return m_num_buckets_specified; }
389 
390   /**
391     Converts the histogram to a JSON object.
392 
393     @param[in,out] json_object output where the histogram is to be stored. The
394                    caller is responsible for allocating/deallocating the JSON
395                    object
396 
397     @return     true on error, false otherwise
398   */
399   virtual bool histogram_to_json(Json_object *json_object) const = 0;
400 
401   /**
402     Converts JSON object to a histogram.
403 
404     @param  mem_root    MEM_ROOT where the histogram will be allocated
405     @param  schema_name the schema name
406     @param  table_name  the table name
407     @param  column_name the column name
408     @param  json_object output where the histogram is stored
409 
410     @return nullptr on error. Otherwise a histogram allocated on the provided
411             MEM_ROOT.
412   */
413   static Histogram *json_to_histogram(MEM_ROOT *mem_root,
414                                       const std::string &schema_name,
415                                       const std::string &table_name,
416                                       const std::string &column_name,
417                                       const Json_object &json_object);
418 
419   /**
420     Make a clone of the current histogram
421 
422     @param mem_root the MEM_ROOT on which the new histogram will be allocated.
423 
424     @return a histogram allocated on the provided MEM_ROOT. Returns nullptr
425             on error.
426   */
427   virtual Histogram *clone(MEM_ROOT *mem_root) const = 0;
428 
429   /**
430     Store this histogram to persistent storage (data dictionary).
431 
432     @param thd Thread handler.
433 
434     @return false on success, true on error.
435   */
436   bool store_histogram(THD *thd) const;
437 
438   /**
439     Get selectivity estimation.
440 
441     This function will try and get the selectivity estimation for a predicate
442     on the form "COLUMN OPERATOR CONSTANT", for instance "SELECT * FROM t1
443     WHERE col1 > 23;".
444 
445     This function will take care of several of things, for instance checking
446     that the value we are estimating the selectivity for is a constant value.
447 
448     The order of the Items provided does not matter. For instance, of the
449     operator argument given is "EQUALS_TO", it does not matter if the constant
450     value is provided as the first or the second argument; this function will
451     take care of this.
452 
453     @param items            an array of items that contains both the field we
454                             are estimating the selectivity for, as well as the
455                             user-provided constant values.
456     @param item_count       the number of Items in the Item array.
457     @param op               the predicate operator
458     @param[out] selectivity the calculated selectivity if a usable histogram was
459                             found
460 
461     @retval true if an error occurred (the Item provided was not a constant
462     value or similar).
463     @return false if success
464   */
465   bool get_selectivity(Item **items, size_t item_count, enum_operator op,
466                        double *selectivity) const;
467 
468   /**
469     @return the fraction of non-null values in the histogram.
470   */
get_non_null_values_frequency()471   double get_non_null_values_frequency() const {
472     return 1.0 - get_null_values_fraction();
473   }
474 };
475 
476 /**
477   Create a histogram from a value map.
478 
479   This function will build a histogram from a value map. The histogram type
480   depends on both the size of the input data, as well as the number of buckets
481   specified. If the number of distinct values is less than or equal to the
482   number of buckets, a Singleton histogram will be created. Otherwise, an
483   equi-height histogram will be created.
484 
485   The histogram will be allocated on the supplied mem_root, and it is the
486   callers responsibility to properly clean up when the histogram isn't needed
487   anymore.
488 
489   @param   mem_root        the MEM_ROOT where the histogram contents will be
490                            allocated
491   @param   value_map       a value map containing [value, frequency]
492   @param   num_buckets     the maximum number of buckets to create
493   @param   db_name         name of the database this histogram represents
494   @param   tbl_name        name of the table this histogram represents
495   @param   col_name        name of the column this histogram represents
496 
497   @return  a histogram, using at most "num_buckets" buckets. The histogram
498            type depends on the size of the input data, and the number of
499            buckets
500 */
501 template <class T>
502 Histogram *build_histogram(MEM_ROOT *mem_root, const Value_map<T> &value_map,
503                            size_t num_buckets, const std::string &db_name,
504                            const std::string &tbl_name,
505                            const std::string &col_name);
506 
507 /**
508   Create or update histograms for a set of columns of a given table.
509 
510   This function will try to create histogram statistics for all the columns
511   specified. If one of the columns fail, it will continue to the next one and
512   try.
513 
514   @param thd Thread handler.
515   @param table The table where we should look for the columns/data.
516   @param columns Columns specified by the user.
517   @param num_buckets The maximum number of buckets to create in each
518          histogram.
519   @param results A map where the result of each operation is stored.
520 
521   @return false on success, true on error.
522 */
523 bool update_histogram(THD *thd, TABLE_LIST *table, const columns_set &columns,
524                       int num_buckets, results_map &results);
525 
526 /**
527   Drop histograms for all columns in a given table.
528 
529   @param thd Thread handler.
530   @param table The table where we should look for the columns.
531   @param original_table_def Original table definition.
532   @param results A map where the result of each operation is stored.
533 
534   @return false on success, true on error.
535 */
536 bool drop_all_histograms(THD *thd, const TABLE_LIST &table,
537                          const dd::Table &original_table_def,
538                          results_map &results);
539 
540 /**
541   Drop histograms for a set of columns in a given table.
542 
543   This function will try to drop the histogram statistics for all specified
544   columns. If one of the columns fail, it will continue to the next one and try.
545 
546   @param thd Thread handler.
547   @param table The table where we should look for the columns.
548   @param columns Columns specified by the user.
549   @param results A map where the result of each operation is stored.
550 
551   @return false on success, true on error.
552 */
553 bool drop_histograms(THD *thd, const TABLE_LIST &table,
554                      const columns_set &columns, results_map &results);
555 
556 /**
557   Rename histograms for all columns in a given table.
558 
559   @param thd             Thread handler.
560   @param old_schema_name The old schema name
561   @param old_table_name  The old table name
562   @param new_schema_name The new schema name
563   @param new_table_name  The new table name
564   @param results         A map where the result of each operation is stored.
565 
566   @return false on success, true on error.
567 */
568 bool rename_histograms(THD *thd, const char *old_schema_name,
569                        const char *old_table_name, const char *new_schema_name,
570                        const char *new_table_name, results_map &results);
571 
572 bool find_histogram(THD *thd, const std::string &schema_name,
573                     const std::string &table_name,
574                     const std::string &column_name,
575                     const Histogram **histogram);
576 }  // namespace histograms
577 
578 #endif
579