1 /* 2 Copyright (c) 2009, 2011, Monty Program Ab 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; version 2 of the License. 7 8 This program is distributed in the hope that it will be useful, 9 but WITHOUT ANY WARRANTY; without even the implied warranty of 10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 GNU General Public License for more details. 12 13 You should have received a copy of the GNU General Public License 14 along with this program; if not, write to the Free Software 15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ 16 17 /** 18 @defgroup DS-MRR declarations 19 @{ 20 */ 21 22 /** 23 A Disk-Sweep implementation of MRR Interface (DS-MRR for short) 24 25 This is a "plugin"(*) for storage engines that allows to 26 1. When doing index scans, read table rows in rowid order; 27 2. when making many index lookups, do them in key order and don't 28 lookup the same key value multiple times; 29 3. Do both #1 and #2, when applicable. 30 These changes are expected to speed up query execution for disk-based 31 storage engines running io-bound loads and "big" queries (ie. queries that 32 do joins and enumerate lots of records). 33 34 (*) - only conceptually. No dynamic loading or binary compatibility of any 35 kind. 36 37 General scheme of things: 38 39 SQL Layer code 40 | | | 41 v v v 42 -|---|---|---- handler->multi_range_read_XXX() function calls 43 | | | 44 _____________________________________ 45 / DS-MRR module \ 46 | (order/de-duplicate lookup keys, | 47 | scan indexes in key order, | 48 | order/de-duplicate rowids, | 49 | retrieve full record reads in rowid | 50 | order) | 51 \_____________________________________/ 52 | | | 53 -|---|---|----- handler->read_range_first()/read_range_next(), 54 | | | handler->index_read(), handler->rnd_pos() calls. 55 | | | 56 v v v 57 Storage engine internals 58 59 60 Currently DS-MRR is used by MyISAM, InnoDB and Maria storage engines. 61 Potentially it can be used with any table handler that has disk-based data 62 storage and has better performance when reading data in rowid order. 63 */ 64 65 #include "sql_lifo_buffer.h" 66 67 class DsMrr_impl; 68 class Mrr_ordered_index_reader; 69 70 71 /* A structure with key parameters that's shared among several classes */ 72 class Key_parameters 73 { 74 public: 75 uint key_tuple_length; /* Length of index lookup tuple, in bytes */ 76 key_part_map key_tuple_map; /* keyparts used in index lookup tuples */ 77 78 /* 79 This is 80 = key_tuple_length if we copy keys to buffer 81 = sizeof(void*) if we're using pointers to materialized keys. 82 */ 83 uint key_size_in_keybuf; 84 85 /* TRUE <=> don't copy key values, use pointers to them instead. */ 86 bool use_key_pointers; 87 88 /* TRUE <=> We can get at most one index tuple for a lookup key */ 89 bool index_ranges_unique; 90 }; 91 92 93 /** 94 A class to enumerate (record, range_id) pairs that match given key value. 95 96 @note 97 98 The idea is that we have a Lifo_buffer which holds (key, range_id) pairs 99 ordered by key value. From the front of the buffer we see 100 101 (key_val1, range_id1), (key_val1, range_id2) ... (key_val2, range_idN) 102 103 we take the first elements that have the same key value (key_val1 in the 104 example above), and make lookup into the table. The table will have 105 multiple matches for key_val1: 106 107 == Table Index == 108 ... 109 key_val1 -> key_val1, index_tuple1 110 key_val1, index_tuple2 111 ... 112 key_val1, index_tupleN 113 ... 114 115 Our goal is to produce all possible combinations, i.e. we need: 116 117 {(key_val1, index_tuple1), range_id1} 118 {(key_val1, index_tuple1), range_id2} 119 ... ... | 120 {(key_val1, index_tuple1), range_idN}, 121 122 {(key_val1, index_tuple2), range_id1} 123 {(key_val1, index_tuple2), range_id2} 124 ... ... | 125 {(key_val1, index_tuple2), range_idN}, 126 127 ... ... ... 128 129 {(key_val1, index_tupleK), range_idN} 130 */ 131 132 class Key_value_records_iterator 133 { 134 /* Use this to get table handler, key buffer and other parameters */ 135 Mrr_ordered_index_reader *owner; 136 137 /* Iterator to get (key, range_id) pairs from */ 138 Lifo_buffer_iterator identical_key_it; 139 140 /* 141 Last of the identical key values (when we get this pointer from 142 identical_key_it, it will be time to stop). 143 */ 144 uchar *last_identical_key_ptr; 145 146 /* 147 FALSE <=> we're right after the init() call, the record has been already 148 read with owner->file->index_read_map() call 149 */ 150 bool get_next_row; 151 152 public: 153 int init(Mrr_ordered_index_reader *owner_arg); 154 int get_next(range_id_t *range_info); 155 void move_to_next_key_value(); 156 }; 157 158 159 /* 160 Buffer manager interface. Mrr_reader objects use it to inqure DsMrr_impl 161 to manage buffer space for them. 162 */ 163 typedef struct st_buffer_manager 164 { 165 public: 166 /* Opaque value to be passed as the first argument to all member functions */ 167 void *arg; 168 169 /* 170 This is called when we've freed more space from the rowid buffer. The 171 callee will get the unused space from the rowid buffer and give it to the 172 key buffer. 173 */ 174 void (*redistribute_buffer_space)(void *arg); 175 176 /* 177 This is called when both key and rowid buffers are empty, and so it's time 178 to reset them to their original size (They've lost their original size, 179 because we were dynamically growing rowid buffer and shrinking key buffer). 180 */ 181 void (*reset_buffer_sizes)(void *arg); 182 183 } Buffer_manager; 184 185 186 /* 187 Mrr_reader - DS-MRR execution strategy abstraction 188 189 A reader produces ([index]_record, range_info) pairs, and requires periodic 190 refill operations. 191 192 - one starts using the reader by calling reader->get_next(), 193 - when a get_next() call returns HA_ERR_END_OF_FILE, one must call 194 refill_buffer() before they can make more get_next() calls. 195 - when refill_buffer() returns HA_ERR_END_OF_FILE, this means the real 196 end of stream and get_next() should not be called anymore. 197 198 Both functions can return other error codes, these mean unrecoverable errors 199 after which one cannot continue. 200 */ 201 202 class Mrr_reader 203 { 204 public: 205 virtual int get_next(range_id_t *range_info) = 0; 206 virtual int refill_buffer(bool initial) = 0; ~Mrr_reader()207 virtual ~Mrr_reader() {}; /* just to remove compiler warning */ 208 }; 209 210 211 /* 212 A common base for readers that do index scans and produce index tuples 213 */ 214 215 class Mrr_index_reader : public Mrr_reader 216 { 217 protected: 218 handler *file; /* Handler object to use */ 219 public: 220 virtual int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 221 void *seq_init_param, uint n_ranges, 222 uint mode, Key_parameters *key_par, 223 Lifo_buffer *key_buffer, 224 Buffer_manager *buf_manager_arg) = 0; 225 226 /* Get pointer to place where every get_next() call will put rowid */ 227 virtual uchar *get_rowid_ptr() = 0; 228 /* Get the rowid (call this after get_next() call) */ 229 virtual void position(); 230 virtual bool skip_record(range_id_t range_id, uchar *rowid) = 0; 231 interrupt_read()232 virtual void interrupt_read() {} resume_read()233 virtual void resume_read() {} 234 }; 235 236 237 /* 238 A "bypass" index reader that just does and index scan. The index scan is done 239 by calling default MRR implementation (i.e. handler::multi_range_read_XXX()) 240 functions. 241 */ 242 243 class Mrr_simple_index_reader : public Mrr_index_reader 244 { 245 public: 246 int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 247 void *seq_init_param, uint n_ranges, 248 uint mode, Key_parameters *key_par, 249 Lifo_buffer *key_buffer, 250 Buffer_manager *buf_manager_arg); 251 int get_next(range_id_t *range_info); refill_buffer(bool initial)252 int refill_buffer(bool initial) { return initial? 0: HA_ERR_END_OF_FILE; } get_rowid_ptr()253 uchar *get_rowid_ptr() { return file->ref; } skip_record(range_id_t range_id,uchar * rowid)254 bool skip_record(range_id_t range_id, uchar *rowid) 255 { 256 return (file->mrr_funcs.skip_record && 257 file->mrr_funcs.skip_record(file->mrr_iter, range_id, rowid)); 258 } 259 }; 260 261 262 /* 263 A reader that sorts the key values before it makes the index lookups. 264 */ 265 266 class Mrr_ordered_index_reader : public Mrr_index_reader 267 { 268 public: 269 int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 270 void *seq_init_param, uint n_ranges, 271 uint mode, Key_parameters *key_par, 272 Lifo_buffer *key_buffer, 273 Buffer_manager *buf_manager_arg); 274 int get_next(range_id_t *range_info); 275 int refill_buffer(bool initial); get_rowid_ptr()276 uchar *get_rowid_ptr() { return file->ref; } 277 skip_record(range_id_t range_info,uchar * rowid)278 bool skip_record(range_id_t range_info, uchar *rowid) 279 { 280 return (mrr_funcs.skip_record && 281 mrr_funcs.skip_record(mrr_iter, range_info, rowid)); 282 } 283 skip_index_tuple(range_id_t range_info)284 bool skip_index_tuple(range_id_t range_info) 285 { 286 return (mrr_funcs.skip_index_tuple && 287 mrr_funcs.skip_index_tuple(mrr_iter, range_info)); 288 } 289 290 bool set_interruption_temp_buffer(uint rowid_length, uint key_len, 291 uint saved_pk_len, 292 uchar **space_start, uchar *space_end); 293 void set_no_interruption_temp_buffer(); 294 295 void interrupt_read(); 296 void resume_read(); 297 void position(); 298 private: 299 Key_value_records_iterator kv_it; 300 301 bool scanning_key_val_iter; 302 303 /* Buffer to store (key, range_id) pairs */ 304 Lifo_buffer *key_buffer; 305 306 /* This manages key buffer allocation and sizing for us */ 307 Buffer_manager *buf_manager; 308 309 Key_parameters keypar; /* index scan and lookup tuple parameters */ 310 311 /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ 312 bool is_mrr_assoc; 313 314 /* Range sequence iteration members */ 315 RANGE_SEQ_IF mrr_funcs; 316 range_seq_t mrr_iter; 317 318 /* TRUE == reached eof when enumerating ranges */ 319 bool source_exhausted; 320 321 /* 322 Following members are for interrupt_read()/resume_read(). The idea is that 323 in some cases index scan that is done by this object is interrupted by 324 rnd_pos() calls made by Mrr_ordered_rndpos_reader. The problem is that 325 we're sharing handler->record[0] with that object, and it destroys its 326 contents. 327 We need to save/restore our current 328 - index tuple (for pushed index condition checks) 329 - clustered primary key values (again, for pushed index condition checks) 330 - rowid of the last record we've retrieved (in case this rowid matches 331 multiple ranges and we'll need to return it again) 332 */ 333 bool support_scan_interruptions; 334 /* Space where we save the rowid of the last record we've returned */ 335 uchar *saved_rowid; 336 337 /* TRUE <=> saved_rowid has the last saved rowid */ 338 bool have_saved_rowid; 339 340 uchar *saved_key_tuple; /* Saved current key tuple */ 341 uchar *saved_primary_key; /* Saved current primary key tuple */ 342 343 /* 344 TRUE<=> saved_key_tuple (and saved_primary_key when applicable) have 345 valid values. 346 */ 347 bool read_was_interrupted; 348 349 static int compare_keys(void* arg, uchar* key1, uchar* key2); 350 static int compare_keys_reverse(void* arg, uchar* key1, uchar* key2); 351 352 friend class Key_value_records_iterator; 353 friend class DsMrr_impl; 354 friend class Mrr_ordered_rndpos_reader; 355 }; 356 357 358 /* 359 A reader that gets rowids from an Mrr_index_reader, and then sorts them 360 before getting full records with handler->rndpos() calls. 361 */ 362 363 class Mrr_ordered_rndpos_reader : public Mrr_reader 364 { 365 public: 366 int init(handler *file, Mrr_index_reader *index_reader, uint mode, 367 Lifo_buffer *buf, Rowid_filter *filter); 368 int get_next(range_id_t *range_info); 369 int refill_buffer(bool initial); 370 private: 371 handler *file; /* Handler to use */ 372 373 /* This what we get (rowid, range_info) pairs from */ 374 Mrr_index_reader *index_reader; 375 376 /* index_reader->get_next() puts rowid here */ 377 uchar *index_rowid; 378 379 /* TRUE <=> index_reader->refill_buffer() call has returned EOF */ 380 bool index_reader_exhausted; 381 382 /* 383 TRUE <=> We should call index_reader->refill_buffer(). This happens if 384 1. we've made index_reader->get_next() call which returned EOF 385 2. we haven't made any index_reader calls (and our first call should 386 be index_reader->refill_buffer(initial=TRUE) 387 */ 388 bool index_reader_needs_refill; 389 390 /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ 391 bool is_mrr_assoc; 392 393 /* 394 When reading from ordered rowid buffer: the rowid element of the last 395 buffer element that has rowid identical to this one. 396 */ 397 uchar *last_identical_rowid; 398 399 /* Buffer to store (rowid, range_id) pairs */ 400 Lifo_buffer *rowid_buffer; 401 402 /* Rowid filter to be checked against (if any) */ 403 Rowid_filter *rowid_filter; 404 405 int refill_from_index_reader(); 406 }; 407 408 409 /* 410 A primitive "factory" of various Mrr_*_reader classes (the point is to 411 get various kinds of readers without having to allocate them on the heap) 412 */ 413 414 class Mrr_reader_factory 415 { 416 public: 417 Mrr_ordered_rndpos_reader ordered_rndpos_reader; 418 Mrr_ordered_index_reader ordered_index_reader; 419 Mrr_simple_index_reader simple_index_reader; 420 }; 421 422 423 #define DSMRR_IMPL_SORT_KEYS HA_MRR_IMPLEMENTATION_FLAG1 424 #define DSMRR_IMPL_SORT_ROWIDS HA_MRR_IMPLEMENTATION_FLAG2 425 426 /* 427 DS-MRR implementation for one table. Create/use one object of this class for 428 each ha_{myisam/innobase/etc} object. That object will be further referred to 429 as "the handler" 430 431 DsMrr_impl supports has the following execution strategies: 432 433 - Bypass DS-MRR, pass all calls to default MRR implementation, which is 434 an MRR-to-non-MRR call converter. 435 - Key-Ordered Retrieval 436 - Rowid-Ordered Retrieval 437 438 DsMrr_impl will use one of the above strategies, or a combination of them, 439 according to the following diagram: 440 441 (mrr function calls) 442 | 443 +----------------->-----------------+ 444 | | 445 ___________v______________ _______________v________________ 446 / default: use lookup keys \ / KEY-ORDERED RETRIEVAL: \ 447 | (or ranges) in whatever | | sort lookup keys and then make | 448 | order they are supplied | | index lookups in index order | 449 \__________________________/ \________________________________/ 450 | | | | | 451 +---<---+ | +--------------->-----------|----+ 452 | | | | 453 | | +---------------+ | 454 | ______v___ ______ | _______________v_______________ 455 | / default: read \ | / ROWID-ORDERED RETRIEVAL: \ 456 | | table records | | | Before reading table records, | 457 v | in random order | v | sort their rowids and then | 458 | \_________________/ | | read them in rowid order | 459 | | | \_______________________________/ 460 | | | | 461 | | | | 462 +-->---+ | +----<------+-----------<--------+ 463 | | | 464 v v v 465 (table records and range_ids) 466 467 The choice of strategy depends on MRR scan properties, table properties 468 (whether we're scanning clustered primary key), and @@optimizer_switch 469 settings. 470 471 Key-Ordered Retrieval 472 --------------------- 473 The idea is: if MRR scan is essentially a series of lookups on 474 475 tbl.key=value1 OR tbl.key=value2 OR ... OR tbl.key=valueN 476 477 then it makes sense to collect and order the set of lookup values, i.e. 478 479 sort(value1, value2, .. valueN) 480 481 and then do index lookups in index order. This results in fewer index page 482 fetch operations, and we also can avoid making multiple index lookups for the 483 same value. That is, if value1=valueN we can easily discover that after 484 sorting and make one index lookup for them instead of two. 485 486 Rowid-Ordered Retrieval 487 ----------------------- 488 If we do a regular index scan or a series of index lookups, we'll be hitting 489 table records at random. For disk-based engines, this is much slower than 490 reading the same records in disk order. We assume that disk ordering of 491 rows is the same as ordering of their rowids (which is provided by 492 handler::cmp_ref()) 493 In order to retrieve records in different order, we must separate index 494 scanning and record fetching, that is, MRR scan uses the following steps: 495 496 1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and 497 fill a buffer with {rowid, range_id} pairs 498 2. Sort the buffer by rowid value 499 3. for each {rowid, range_id} pair in the buffer 500 get record by rowid and return the {record, range_id} pair 501 4. Repeat the above steps until we've exhausted the list of ranges we're 502 scanning. 503 504 Buffer space management considerations 505 -------------------------------------- 506 With regards to buffer/memory management, MRR interface specifies that 507 - SQL layer provides multi_range_read_init() with buffer of certain size. 508 - MRR implementation may use (i.e. have at its disposal till the end of 509 the MRR scan) all of the buffer, or return the unused end of the buffer 510 to SQL layer. 511 512 DS-MRR needs buffer in order to accumulate and sort rowids and/or keys. When 513 we need to accumulate/sort only keys (or only rowids), it is fairly trivial. 514 515 When we need to accumulate/sort both keys and rowids, efficient buffer use 516 gets complicated. We need to: 517 - First, accumulate keys and sort them 518 - Then use the keys (smaller values go first) to obtain rowids. A key is not 519 needed after we've got matching rowids for it. 520 - Make sure that rowids are accumulated at the front of the buffer, so that we 521 can return the end part of the buffer to SQL layer, should there be too 522 few rowid values to occupy the buffer. 523 524 All of these goals are achieved by using the following scheme: 525 526 | | We get an empty buffer from SQL layer. 527 528 | *-| 529 | *----| First, we fill the buffer with keys. Key_buffer 530 | *-------| part grows from end of the buffer space to start 531 | *----------| (In this picture, the buffer is big enough to 532 | *-------------| accomodate all keys and even have some space left) 533 534 | *=============| We want to do key-ordered index scan, so we sort 535 the keys 536 537 |-x *===========| Then we use the keys get rowids. Rowids are 538 |----x *========| stored from start of buffer space towards the end. 539 |--------x *=====| The part of the buffer occupied with keys 540 |------------x *===| gradually frees up space for rowids. In this 541 |--------------x *=| picture we run out of keys before we've ran out 542 |----------------x | of buffer space (it can be other way as well). 543 544 |================x | Then we sort the rowids. 545 546 | |~~~| The unused part of the buffer is at the end, so 547 we can return it to the SQL layer. 548 549 |================* Sorted rowids are then used to read table records 550 in disk order 551 552 */ 553 554 class DsMrr_impl 555 { 556 public: 557 typedef void (handler::*range_check_toggle_func_t)(bool on); 558 DsMrr_impl()559 DsMrr_impl() 560 : secondary_file(NULL), 561 rowid_filter(NULL) {}; 562 init(handler * h_arg,TABLE * table_arg)563 void init(handler *h_arg, TABLE *table_arg) 564 { 565 primary_file= h_arg; 566 table= table_arg; 567 } 568 int dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 569 void *seq_init_param, uint n_ranges, uint mode, 570 HANDLER_BUFFER *buf); 571 void dsmrr_close(); 572 int dsmrr_next(range_id_t *range_info); 573 574 ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts, 575 uint *bufsz, uint *flags, Cost_estimate *cost); 576 577 ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, 578 void *seq_init_param, uint n_ranges, uint *bufsz, 579 uint *flags, Cost_estimate *cost); 580 581 int dsmrr_explain_info(uint mrr_mode, char *str, size_t size); 582 private: 583 /* Buffer to store (key, range_id) pairs */ 584 Lifo_buffer *key_buffer; 585 586 /* 587 The "owner" handler object (the one that is expected to "own" this object 588 and call its functions). 589 */ 590 handler *primary_file; 591 TABLE *table; /* Always equal to primary_file->table */ 592 593 /* 594 Secondary handler object. (created when needed, we need it when we need 595 to run both index scan and rnd_pos() scan at the same time) 596 */ 597 handler *secondary_file; 598 599 /* 600 The rowid filter that DS-MRR has "unpushed" from the storage engine. 601 If it's present, DS-MRR will use it. 602 */ 603 Rowid_filter *rowid_filter; 604 605 uint keyno; /* index we're running the scan on */ 606 /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ 607 bool is_mrr_assoc; 608 609 Mrr_reader_factory reader_factory; 610 611 Mrr_reader *strategy; 612 bool strategy_exhausted; 613 614 Mrr_index_reader *index_strategy; 615 616 /* The whole buffer space that we're using */ 617 uchar *full_buf; 618 uchar *full_buf_end; 619 620 /* 621 When using both rowid and key buffers: the boundary between key and rowid 622 parts of the buffer. This is the "original" value, actual memory ranges 623 used by key and rowid parts may be different because of dynamic space 624 reallocation between them. 625 */ 626 uchar *rowid_buffer_end; 627 628 /* 629 One of the following two is used for key buffer: forward is used when 630 we only need key buffer, backward is used when we need both key and rowid 631 buffers. 632 */ 633 Forward_lifo_buffer forward_key_buf; 634 Backward_lifo_buffer backward_key_buf; 635 636 /* 637 Buffer to store (rowid, range_id) pairs, or just rowids if 638 is_mrr_assoc==FALSE 639 */ 640 Forward_lifo_buffer rowid_buffer; 641 642 bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 643 Cost_estimate *cost); 644 bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 645 uint *buffer_size, uint extra_mem_overhead, 646 Cost_estimate *cost); 647 bool check_cpk_scan(THD *thd, TABLE_SHARE *share, uint keyno, uint mrr_flags); 648 649 bool setup_buffer_sharing(uint key_size_in_keybuf, key_part_map key_tuple_map); 650 651 /* Buffer_manager and its member functions */ 652 Buffer_manager buf_manager; 653 static void redistribute_buffer_space(void *dsmrr_arg); 654 static void reset_buffer_sizes(void *dsmrr_arg); 655 static void do_nothing(void *dsmrr_arg); 656 get_key_buffer()657 Lifo_buffer* get_key_buffer() { return key_buffer; } 658 659 friend class Key_value_records_iterator; 660 friend class Mrr_ordered_index_reader; 661 friend class Mrr_ordered_rndpos_reader; 662 663 int setup_two_handlers(); 664 void close_second_handler(); 665 }; 666 667 /** 668 @} (end of group DS-MRR declarations) 669 */ 670 671