1 #ifndef HISTOGRAMS_HISTOGRAM_INCLUDED 2 #define HISTOGRAMS_HISTOGRAM_INCLUDED 3 4 /* Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License, version 2.0, 8 as published by the Free Software Foundation. 9 10 This program is also distributed with certain software (including 11 but not limited to OpenSSL) that is licensed under separate terms, 12 as designated in a particular file or component or in included license 13 documentation. The authors of MySQL hereby grant you an additional 14 permission to link the program and your derivative works with the 15 separately licensed software that they have included with MySQL. 16 17 This program is distributed in the hope that it will be useful, 18 but WITHOUT ANY WARRANTY; without even the implied warranty of 19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 GNU General Public License, version 2.0, for more details. 21 22 You should have received a copy of the GNU General Public License 23 along with this program; if not, write to the Free Software 24 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ 25 26 /** 27 @file sql/histograms/histogram.h 28 Histogram base class. 29 30 This file defines the base class for all histogram types. We keep the base 31 class itself non-templatized in order to more easily send a histogram as an 32 argument, collect multiple histograms in a single collection etc. 33 34 A histogram is stored as a JSON object. This gives the flexibility of storing 35 virtually an unlimited number of buckets, data values in its full length and 36 easily expanding with new histogram types in the future. They are stored 37 persistently in the system table mysql.column_stats. 38 39 We keep all histogram code in the namespace "histograms" in order to avoid 40 name conflicts etc. 41 */ 42 43 #include <cstddef> // size_t 44 #include <functional> 45 #include <map> // std::map 46 #include <set> // std::set 47 #include <string> // std::string 48 #include <utility> // std::pair 49 50 #include "lex_string.h" // LEX_CSTRING 51 #include "my_base.h" // ha_rows 52 #include "sql/histograms/value_map_type.h" 53 #include "sql/mem_root_allocator.h" // Mem_root_allocator 54 #include "sql/stateless_allocator.h" // Stateless_allocator 55 56 class Item; 57 class Json_dom; 58 class Json_object; 59 class THD; 60 struct TYPELIB; 61 62 namespace dd { 63 class Table; 64 } // namespace dd 65 namespace histograms { 66 struct Histogram_comparator; 67 template <class T> 68 class Value_map; 69 } // namespace histograms 70 struct CHARSET_INFO; 71 struct MEM_ROOT; 72 struct TABLE_LIST; 73 74 namespace histograms { 75 76 /// The default (and invalid) value for "m_null_values_fraction". 77 static const double INVALID_NULL_VALUES_FRACTION = -1.0; 78 79 enum class Message { 80 FIELD_NOT_FOUND, 81 UNSUPPORTED_DATA_TYPE, 82 TEMPORARY_TABLE, 83 ENCRYPTED_TABLE, 84 VIEW, 85 HISTOGRAM_CREATED, 86 MULTIPLE_TABLES_SPECIFIED, 87 COVERED_BY_SINGLE_PART_UNIQUE_INDEX, 88 NO_HISTOGRAM_FOUND, 89 HISTOGRAM_DELETED, 90 SERVER_READ_ONLY 91 }; 92 93 struct Histogram_psi_key_alloc { 94 void *operator()(size_t s) const; 95 }; 96 97 template <class T> 98 using Histogram_key_allocator = Stateless_allocator<T, Histogram_psi_key_alloc>; 99 100 template <class T> 101 using value_map_allocator = Mem_root_allocator<std::pair<const T, ha_rows>>; 102 103 template <typename T> 104 using value_map_type = 105 std::map<T, ha_rows, Histogram_comparator, value_map_allocator<T>>; 106 107 using columns_set = std::set<std::string, std::less<std::string>, 108 Histogram_key_allocator<std::string>>; 109 110 using results_map = 111 std::map<std::string, Message, std::less<std::string>, 112 Histogram_key_allocator<std::pair<const std::string, Message>>>; 113 114 /** 115 The different operators we can ask histogram statistics for selectivity 116 estimations. 117 */ 118 enum class enum_operator { 119 EQUALS_TO, 120 GREATER_THAN, 121 LESS_THAN, 122 IS_NULL, 123 IS_NOT_NULL, 124 LESS_THAN_OR_EQUAL, 125 GREATER_THAN_OR_EQUAL, 126 NOT_EQUALS_TO, 127 BETWEEN, 128 NOT_BETWEEN, 129 IN_LIST, 130 NOT_IN_LIST 131 }; 132 133 /** 134 Histogram base class. 135 */ 136 class Histogram { 137 public: 138 /// All supported histogram types in MySQL. 139 enum class enum_histogram_type { EQUI_HEIGHT, SINGLETON }; 140 141 /// String representation of the JSON field "histogram-type". histogram_type_str()142 static constexpr const char *histogram_type_str() { return "histogram-type"; } 143 144 /// String representation of the JSON field "data-type". data_type_str()145 static constexpr const char *data_type_str() { return "data-type"; } 146 147 /// String representation of the JSON field "collation-id". collation_id_str()148 static constexpr const char *collation_id_str() { return "collation-id"; } 149 150 /// String representation of the histogram type SINGLETON. singleton_str()151 static constexpr const char *singleton_str() { return "singleton"; } 152 153 /// String representation of the histogram type EQUI-HEIGHT. equi_height_str()154 static constexpr const char *equi_height_str() { return "equi-height"; } 155 156 protected: 157 double m_sampling_rate; 158 159 /// The fraction of NULL values in the histogram (between 0.0 and 1.0). 160 double m_null_values_fraction; 161 162 /// The character set for the data stored 163 const CHARSET_INFO *m_charset; 164 165 /// The number of buckets originally specified 166 size_t m_num_buckets_specified; 167 168 /// String representation of the JSON field "buckets". buckets_str()169 static constexpr const char *buckets_str() { return "buckets"; } 170 171 /// String representation of the JSON field "last-updated". last_updated_str()172 static constexpr const char *last_updated_str() { return "last-updated"; } 173 174 /// String representation of the JSON field "null-values". null_values_str()175 static constexpr const char *null_values_str() { return "null-values"; } 176 sampling_rate_str()177 static constexpr const char *sampling_rate_str() { return "sampling-rate"; } 178 179 /// String representation of the JSON field "number-of-buckets-specified". numer_of_buckets_specified_str()180 static constexpr const char *numer_of_buckets_specified_str() { 181 return "number-of-buckets-specified"; 182 } 183 184 /** 185 Write the data type of this histogram into a JSON object. 186 187 @param json_object the JSON object where we will write the histogram 188 data type 189 190 @return true on error, false otherwise 191 */ 192 bool histogram_data_type_to_json(Json_object *json_object) const; 193 194 /** 195 Return the value that is contained in the JSON DOM object. 196 197 For most types, this function simply returns the contained value. For String 198 values, the value is allocated on this histograms MEM_ROOT before it is 199 returned. This allows the String value to survive the entire lifetime of the 200 histogram object. 201 202 @param json_dom the JSON DOM object to extract the value from 203 @param out the value from the JSON DOM object 204 205 @return true on error, false otherwise 206 */ 207 template <class T> 208 bool extract_json_dom_value(const Json_dom *json_dom, T *out); 209 210 /** 211 Populate the histogram with data from the provided JSON object. The base 212 class also provides an implementation that subclasses must call in order 213 to populate fields that are shared among all histogram types (character set, 214 null values fraction). 215 216 @param json_object the JSON object to read the histogram data from 217 218 @return true on error, false otherwise 219 */ 220 virtual bool json_to_histogram(const Json_object &json_object) = 0; 221 222 private: 223 /// The MEM_ROOT where the histogram contents will be allocated. 224 MEM_ROOT *m_mem_root; 225 226 /// The type of this histogram. 227 const enum_histogram_type m_hist_type; 228 229 /// The type of the data this histogram contains. 230 const Value_map_type m_data_type; 231 232 /// Name of the database this histogram represents. 233 LEX_CSTRING m_database_name; 234 235 /// Name of the table this histogram represents. 236 LEX_CSTRING m_table_name; 237 238 /// Name of the column this histogram represents. 239 LEX_CSTRING m_column_name; 240 241 /** 242 An internal function for getting the selecitvity estimation. 243 244 This function will read/evaluate the value from the given Item, and pass 245 this value on to the correct selectivity estimation function based on the 246 data type of the histogram. For instance, if the data type of the histogram 247 is INT, we will call "val_int" on the Item to evaulate the value as an 248 integer and pass this value on to the next function. 249 250 @param item The Item to read/evaluate the value from. 251 @param op The operator we are estimating the selectivity for. 252 @param typelib In the case of ENUM or SET data type, this parameter holds 253 the type information. This is needed in order to map a 254 string representation of an ENUM/SET value into its correct 255 integer representation (ENUM/SET values are stored as 256 integer values in the histogram). 257 @param[out] selectivity The estimated selectivity, between 0.0 and 1.0 258 inclusive. 259 260 @return true on error (i.e the provided item was NULL), false on success. 261 */ 262 bool get_selectivity_dispatcher(Item *item, const enum_operator op, 263 const TYPELIB *typelib, 264 double *selectivity) const; 265 266 /** 267 An internal function for getting the selecitvity estimation. 268 269 This function will cast the histogram to the correct class (using down_cast) 270 and pass the given value on to the correct selectivity estimation function 271 for that class. 272 273 @param value The value to estimate the selectivity for. 274 275 @return The estimated selectivity, between 0.0 and 1.0 inclusive. 276 */ 277 template <class T> 278 double get_less_than_selectivity_dispatcher(const T &value) const; 279 280 /// @see get_less_than_selectivity_dispatcher 281 template <class T> 282 double get_greater_than_selectivity_dispatcher(const T &value) const; 283 284 /// @see get_less_than_selectivity_dispatcher 285 template <class T> 286 double get_equal_to_selectivity_dispatcher(const T &value) const; 287 288 /** 289 An internal function for applying the correct function for the given 290 operator. 291 292 @param op The operator to apply 293 @param value The value to find the selectivity for. 294 295 @return The estimated selectivity, between 0.0 and 1.0 inclusive. 296 */ 297 template <class T> 298 double apply_operator(const enum_operator op, const T &value) const; 299 300 public: 301 /** 302 Constructor. 303 304 @param mem_root the mem_root where the histogram contents will be allocated 305 @param db_name name of the database this histogram represents 306 @param tbl_name name of the table this histogram represents 307 @param col_name name of the column this histogram represents 308 @param type the histogram type (equi-height, singleton) 309 @param data_type the type of data that this histogram contains 310 */ 311 Histogram(MEM_ROOT *mem_root, const std::string &db_name, 312 const std::string &tbl_name, const std::string &col_name, 313 enum_histogram_type type, Value_map_type data_type); 314 315 /** 316 Copy constructor 317 318 This will make a copy of the provided histogram onto the provided MEM_ROOT. 319 */ 320 Histogram(MEM_ROOT *mem_root, const Histogram &other); 321 322 Histogram(const Histogram &other) = delete; 323 324 /// Destructor. ~Histogram()325 virtual ~Histogram() {} 326 327 /// @return the MEM_ROOT that this histogram uses for allocations get_mem_root()328 MEM_ROOT *get_mem_root() const { return m_mem_root; } 329 330 /** 331 @return name of the database this histogram represents 332 */ get_database_name()333 const LEX_CSTRING get_database_name() const { return m_database_name; } 334 335 /** 336 @return name of the table this histogram represents 337 */ get_table_name()338 const LEX_CSTRING get_table_name() const { return m_table_name; } 339 340 /** 341 @return name of the column this histogram represents 342 */ get_column_name()343 const LEX_CSTRING get_column_name() const { return m_column_name; } 344 345 /** 346 @return type of this histogram 347 */ get_histogram_type()348 enum_histogram_type get_histogram_type() const { return m_hist_type; } 349 350 /** 351 @return the fraction of NULL values, in the range [0.0, 1.0] 352 */ 353 double get_null_values_fraction() const; 354 355 /// @return the character set for the data this histogram contains get_character_set()356 const CHARSET_INFO *get_character_set() const { return m_charset; } 357 358 /// @return the sampling rate used to generate this histogram get_sampling_rate()359 double get_sampling_rate() const { return m_sampling_rate; } 360 361 /** 362 Returns the histogram type as a readable string. 363 364 @return a readable string representation of the histogram type 365 */ 366 virtual std::string histogram_type_to_str() const = 0; 367 368 /** 369 @return number of buckets in this histogram 370 */ 371 virtual size_t get_num_buckets() const = 0; 372 373 /** 374 Get the estimated number of distinct non-NULL values. 375 @return number of distinct non-NULL values 376 */ 377 virtual size_t get_num_distinct_values() const = 0; 378 379 /** 380 @return the data type that this histogram contains 381 */ get_data_type()382 Value_map_type get_data_type() const { return m_data_type; } 383 384 /** 385 @return number of buckets originally specified by the user. This may be 386 higher than the actual number of buckets in the histogram. 387 */ get_num_buckets_specified()388 size_t get_num_buckets_specified() const { return m_num_buckets_specified; } 389 390 /** 391 Converts the histogram to a JSON object. 392 393 @param[in,out] json_object output where the histogram is to be stored. The 394 caller is responsible for allocating/deallocating the JSON 395 object 396 397 @return true on error, false otherwise 398 */ 399 virtual bool histogram_to_json(Json_object *json_object) const = 0; 400 401 /** 402 Converts JSON object to a histogram. 403 404 @param mem_root MEM_ROOT where the histogram will be allocated 405 @param schema_name the schema name 406 @param table_name the table name 407 @param column_name the column name 408 @param json_object output where the histogram is stored 409 410 @return nullptr on error. Otherwise a histogram allocated on the provided 411 MEM_ROOT. 412 */ 413 static Histogram *json_to_histogram(MEM_ROOT *mem_root, 414 const std::string &schema_name, 415 const std::string &table_name, 416 const std::string &column_name, 417 const Json_object &json_object); 418 419 /** 420 Make a clone of the current histogram 421 422 @param mem_root the MEM_ROOT on which the new histogram will be allocated. 423 424 @return a histogram allocated on the provided MEM_ROOT. Returns nullptr 425 on error. 426 */ 427 virtual Histogram *clone(MEM_ROOT *mem_root) const = 0; 428 429 /** 430 Store this histogram to persistent storage (data dictionary). 431 432 @param thd Thread handler. 433 434 @return false on success, true on error. 435 */ 436 bool store_histogram(THD *thd) const; 437 438 /** 439 Get selectivity estimation. 440 441 This function will try and get the selectivity estimation for a predicate 442 on the form "COLUMN OPERATOR CONSTANT", for instance "SELECT * FROM t1 443 WHERE col1 > 23;". 444 445 This function will take care of several of things, for instance checking 446 that the value we are estimating the selectivity for is a constant value. 447 448 The order of the Items provided does not matter. For instance, of the 449 operator argument given is "EQUALS_TO", it does not matter if the constant 450 value is provided as the first or the second argument; this function will 451 take care of this. 452 453 @param items an array of items that contains both the field we 454 are estimating the selectivity for, as well as the 455 user-provided constant values. 456 @param item_count the number of Items in the Item array. 457 @param op the predicate operator 458 @param[out] selectivity the calculated selectivity if a usable histogram was 459 found 460 461 @retval true if an error occurred (the Item provided was not a constant 462 value or similar). 463 @return false if success 464 */ 465 bool get_selectivity(Item **items, size_t item_count, enum_operator op, 466 double *selectivity) const; 467 468 /** 469 @return the fraction of non-null values in the histogram. 470 */ get_non_null_values_frequency()471 double get_non_null_values_frequency() const { 472 return 1.0 - get_null_values_fraction(); 473 } 474 }; 475 476 /** 477 Create a histogram from a value map. 478 479 This function will build a histogram from a value map. The histogram type 480 depends on both the size of the input data, as well as the number of buckets 481 specified. If the number of distinct values is less than or equal to the 482 number of buckets, a Singleton histogram will be created. Otherwise, an 483 equi-height histogram will be created. 484 485 The histogram will be allocated on the supplied mem_root, and it is the 486 callers responsibility to properly clean up when the histogram isn't needed 487 anymore. 488 489 @param mem_root the MEM_ROOT where the histogram contents will be 490 allocated 491 @param value_map a value map containing [value, frequency] 492 @param num_buckets the maximum number of buckets to create 493 @param db_name name of the database this histogram represents 494 @param tbl_name name of the table this histogram represents 495 @param col_name name of the column this histogram represents 496 497 @return a histogram, using at most "num_buckets" buckets. The histogram 498 type depends on the size of the input data, and the number of 499 buckets 500 */ 501 template <class T> 502 Histogram *build_histogram(MEM_ROOT *mem_root, const Value_map<T> &value_map, 503 size_t num_buckets, const std::string &db_name, 504 const std::string &tbl_name, 505 const std::string &col_name); 506 507 /** 508 Create or update histograms for a set of columns of a given table. 509 510 This function will try to create histogram statistics for all the columns 511 specified. If one of the columns fail, it will continue to the next one and 512 try. 513 514 @param thd Thread handler. 515 @param table The table where we should look for the columns/data. 516 @param columns Columns specified by the user. 517 @param num_buckets The maximum number of buckets to create in each 518 histogram. 519 @param results A map where the result of each operation is stored. 520 521 @return false on success, true on error. 522 */ 523 bool update_histogram(THD *thd, TABLE_LIST *table, const columns_set &columns, 524 int num_buckets, results_map &results); 525 526 /** 527 Drop histograms for all columns in a given table. 528 529 @param thd Thread handler. 530 @param table The table where we should look for the columns. 531 @param original_table_def Original table definition. 532 @param results A map where the result of each operation is stored. 533 534 @return false on success, true on error. 535 */ 536 bool drop_all_histograms(THD *thd, const TABLE_LIST &table, 537 const dd::Table &original_table_def, 538 results_map &results); 539 540 /** 541 Drop histograms for a set of columns in a given table. 542 543 This function will try to drop the histogram statistics for all specified 544 columns. If one of the columns fail, it will continue to the next one and try. 545 546 @param thd Thread handler. 547 @param table The table where we should look for the columns. 548 @param columns Columns specified by the user. 549 @param results A map where the result of each operation is stored. 550 551 @return false on success, true on error. 552 */ 553 bool drop_histograms(THD *thd, const TABLE_LIST &table, 554 const columns_set &columns, results_map &results); 555 556 /** 557 Rename histograms for all columns in a given table. 558 559 @param thd Thread handler. 560 @param old_schema_name The old schema name 561 @param old_table_name The old table name 562 @param new_schema_name The new schema name 563 @param new_table_name The new table name 564 @param results A map where the result of each operation is stored. 565 566 @return false on success, true on error. 567 */ 568 bool rename_histograms(THD *thd, const char *old_schema_name, 569 const char *old_table_name, const char *new_schema_name, 570 const char *new_table_name, results_map &results); 571 572 bool find_histogram(THD *thd, const std::string &schema_name, 573 const std::string &table_name, 574 const std::string &column_name, 575 const Histogram **histogram); 576 } // namespace histograms 577 578 #endif 579