1 /* 2 Copyright (c) 2009, 2011, Monty Program Ab 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; version 2 of the License. 7 8 This program is distributed in the hope that it will be useful, 9 but WITHOUT ANY WARRANTY; without even the implied warranty of 10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 GNU General Public License for more details. 12 13 You should have received a copy of the GNU General Public License 14 along with this program; if not, write to the Free Software 15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ 16 17 /** 18 @defgroup DS-MRR declarations 19 @{ 20 */ 21 22 /** 23 A Disk-Sweep implementation of MRR Interface (DS-MRR for short) 24 25 This is a "plugin"(*) for storage engines that allows to 26 1. When doing index scans, read table rows in rowid order; 27 2. when making many index lookups, do them in key order and don't 28 lookup the same key value multiple times; 29 3. Do both #1 and #2, when applicable. 30 These changes are expected to speed up query execution for disk-based 31 storage engines running io-bound loads and "big" queries (ie. queries that 32 do joins and enumerate lots of records). 33 34 (*) - only conceptually. No dynamic loading or binary compatibility of any 35 kind. 36 37 General scheme of things: 38 39 SQL Layer code 40 | | | 41 v v v 42 -|---|---|---- handler->multi_range_read_XXX() function calls 43 | | | 44 _____________________________________ 45 / DS-MRR module \ 46 | (order/de-duplicate lookup keys, | 47 | scan indexes in key order, | 48 | order/de-duplicate rowids, | 49 | retrieve full record reads in rowid | 50 | order) | 51 \_____________________________________/ 52 | | | 53 -|---|---|----- handler->read_range_first()/read_range_next(), 54 | | | handler->index_read(), handler->rnd_pos() calls. 55 | | | 56 v v v 57 Storage engine internals 58 59 60 Currently DS-MRR is used by MyISAM, InnoDB and Maria storage engines. 61 Potentially it can be used with any table handler that has disk-based data 62 storage and has better performance when reading data in rowid order. 63 */ 64 65 #include "sql_lifo_buffer.h" 66 67 class DsMrr_impl; 68 class Mrr_ordered_index_reader; 69 70 71 /* A structure with key parameters that's shared among several classes */ 72 class Key_parameters 73 { 74 public: 75 uint key_tuple_length; /* Length of index lookup tuple, in bytes */ 76 key_part_map key_tuple_map; /* keyparts used in index lookup tuples */ 77 78 /* 79 This is 80 = key_tuple_length if we copy keys to buffer 81 = sizeof(void*) if we're using pointers to materialized keys. 82 */ 83 uint key_size_in_keybuf; 84 85 /* TRUE <=> don't copy key values, use pointers to them instead. */ 86 bool use_key_pointers; 87 88 /* TRUE <=> We can get at most one index tuple for a lookup key */ 89 bool index_ranges_unique; 90 }; 91 92 93 /** 94 A class to enumerate (record, range_id) pairs that match given key value. 95 96 @note 97 98 The idea is that we have a Lifo_buffer which holds (key, range_id) pairs 99 ordered by key value. From the front of the buffer we see 100 101 (key_val1, range_id1), (key_val1, range_id2) ... (key_val2, range_idN) 102 103 we take the first elements that have the same key value (key_val1 in the 104 example above), and make lookup into the table. The table will have 105 multiple matches for key_val1: 106 107 == Table Index == 108 ... 109 key_val1 -> key_val1, index_tuple1 110 key_val1, index_tuple2 111 ... 112 key_val1, index_tupleN 113 ... 114 115 Our goal is to produce all possible combinations, i.e. we need: 116 117 {(key_val1, index_tuple1), range_id1} 118 {(key_val1, index_tuple1), range_id2} 119 ... ... | 120 {(key_val1, index_tuple1), range_idN}, 121 122 {(key_val1, index_tuple2), range_id1} 123 {(key_val1, index_tuple2), range_id2} 124 ... ... | 125 {(key_val1, index_tuple2), range_idN}, 126 127 ... ... ... 128 129 {(key_val1, index_tupleK), range_idN} 130 */ 131 132 class Key_value_records_iterator 133 { 134 /* Use this to get table handler, key buffer and other parameters */ 135 Mrr_ordered_index_reader *owner; 136 137 /* Iterator to get (key, range_id) pairs from */ 138 Lifo_buffer_iterator identical_key_it; 139 140 /* 141 Last of the identical key values (when we get this pointer from 142 identical_key_it, it will be time to stop). 143 */ 144 uchar *last_identical_key_ptr; 145 146 /* 147 FALSE <=> we're right after the init() call, the record has been already 148 read with owner->file->index_read_map() call 149 */ 150 bool get_next_row; 151 152 public: 153 int init(Mrr_ordered_index_reader *owner_arg); 154 int get_next(range_id_t *range_info); 155 void move_to_next_key_value(); 156 }; 157 158 159 /* 160 Buffer manager interface. Mrr_reader objects use it to inqure DsMrr_impl 161 to manage buffer space for them. 162 */ 163 typedef struct st_buffer_manager 164 { 165 public: 166 /* Opaque value to be passed as the first argument to all member functions */ 167 void *arg; 168 169 /* 170 This is called when we've freed more space from the rowid buffer. The 171 callee will get the unused space from the rowid buffer and give it to the 172 key buffer. 173 */ 174 void (*redistribute_buffer_space)(void *arg); 175 176 /* 177 This is called when both key and rowid buffers are empty, and so it's time 178 to reset them to their original size (They've lost their original size, 179 because we were dynamically growing rowid buffer and shrinking key buffer). 180 */ 181 void (*reset_buffer_sizes)(void *arg); 182 183 } Buffer_manager; 184 185 186 /* 187 Mrr_reader - DS-MRR execution strategy abstraction 188 189 A reader produces ([index]_record, range_info) pairs, and requires periodic 190 refill operations. 191 192 - one starts using the reader by calling reader->get_next(), 193 - when a get_next() call returns HA_ERR_END_OF_FILE, one must call 194 refill_buffer() before they can make more get_next() calls. 195 - when refill_buffer() returns HA_ERR_END_OF_FILE, this means the real 196 end of stream and get_next() should not be called anymore. 197 198 Both functions can return other error codes, these mean unrecoverable errors 199 after which one cannot continue. 200 */ 201 202 class Mrr_reader 203 { 204 public: 205 virtual int get_next(range_id_t *range_info) = 0; 206 virtual int refill_buffer(bool initial) = 0; ~Mrr_reader()207 virtual ~Mrr_reader() {}; /* just to remove compiler warning */ 208 }; 209 210 211 /* 212 A common base for readers that do index scans and produce index tuples 213 */ 214 215 class Mrr_index_reader : public Mrr_reader 216 { 217 protected: 218 handler *file; /* Handler object to use */ 219 public: 220 virtual int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 221 void *seq_init_param, uint n_ranges, 222 uint mode, Key_parameters *key_par, 223 Lifo_buffer *key_buffer, 224 Buffer_manager *buf_manager_arg) = 0; 225 226 /* Get pointer to place where every get_next() call will put rowid */ 227 virtual uchar *get_rowid_ptr() = 0; 228 /* Get the rowid (call this after get_next() call) */ 229 virtual void position(); 230 virtual bool skip_record(range_id_t range_id, uchar *rowid) = 0; 231 interrupt_read()232 virtual void interrupt_read() {} resume_read()233 virtual void resume_read() {} 234 }; 235 236 237 /* 238 A "bypass" index reader that just does and index scan. The index scan is done 239 by calling default MRR implementation (i.e. handler::multi_range_read_XXX()) 240 functions. 241 */ 242 243 class Mrr_simple_index_reader : public Mrr_index_reader 244 { 245 public: 246 int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 247 void *seq_init_param, uint n_ranges, 248 uint mode, Key_parameters *key_par, 249 Lifo_buffer *key_buffer, 250 Buffer_manager *buf_manager_arg); 251 int get_next(range_id_t *range_info); refill_buffer(bool initial)252 int refill_buffer(bool initial) { return initial? 0: HA_ERR_END_OF_FILE; } get_rowid_ptr()253 uchar *get_rowid_ptr() { return file->ref; } skip_record(range_id_t range_id,uchar * rowid)254 bool skip_record(range_id_t range_id, uchar *rowid) 255 { 256 return (file->mrr_funcs.skip_record && 257 file->mrr_funcs.skip_record(file->mrr_iter, range_id, rowid)); 258 } 259 }; 260 261 262 /* 263 A reader that sorts the key values before it makes the index lookups. 264 */ 265 266 class Mrr_ordered_index_reader : public Mrr_index_reader 267 { 268 public: 269 int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 270 void *seq_init_param, uint n_ranges, 271 uint mode, Key_parameters *key_par, 272 Lifo_buffer *key_buffer, 273 Buffer_manager *buf_manager_arg); 274 int get_next(range_id_t *range_info); 275 int refill_buffer(bool initial); get_rowid_ptr()276 uchar *get_rowid_ptr() { return file->ref; } 277 skip_record(range_id_t range_info,uchar * rowid)278 bool skip_record(range_id_t range_info, uchar *rowid) 279 { 280 return (mrr_funcs.skip_record && 281 mrr_funcs.skip_record(mrr_iter, range_info, rowid)); 282 } 283 skip_index_tuple(range_id_t range_info)284 bool skip_index_tuple(range_id_t range_info) 285 { 286 return (mrr_funcs.skip_index_tuple && 287 mrr_funcs.skip_index_tuple(mrr_iter, range_info)); 288 } 289 290 bool set_interruption_temp_buffer(uint rowid_length, uint key_len, 291 uint saved_pk_len, 292 uchar **space_start, uchar *space_end); 293 void set_no_interruption_temp_buffer(); 294 295 void interrupt_read(); 296 void resume_read(); 297 void position(); 298 private: 299 Key_value_records_iterator kv_it; 300 301 bool scanning_key_val_iter; 302 303 /* Buffer to store (key, range_id) pairs */ 304 Lifo_buffer *key_buffer; 305 306 /* This manages key buffer allocation and sizing for us */ 307 Buffer_manager *buf_manager; 308 309 Key_parameters keypar; /* index scan and lookup tuple parameters */ 310 311 /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ 312 bool is_mrr_assoc; 313 314 /* Range sequence iteration members */ 315 RANGE_SEQ_IF mrr_funcs; 316 range_seq_t mrr_iter; 317 318 /* TRUE == reached eof when enumerating ranges */ 319 bool source_exhausted; 320 321 /* 322 Following members are for interrupt_read()/resume_read(). The idea is that 323 in some cases index scan that is done by this object is interrupted by 324 rnd_pos() calls made by Mrr_ordered_rndpos_reader. The problem is that 325 we're sharing handler->record[0] with that object, and it destroys its 326 contents. 327 We need to save/restore our current 328 - index tuple (for pushed index condition checks) 329 - clustered primary key values (again, for pushed index condition checks) 330 - rowid of the last record we've retrieved (in case this rowid matches 331 multiple ranges and we'll need to return it again) 332 */ 333 bool support_scan_interruptions; 334 /* Space where we save the rowid of the last record we've returned */ 335 uchar *saved_rowid; 336 337 /* TRUE <=> saved_rowid has the last saved rowid */ 338 bool have_saved_rowid; 339 340 uchar *saved_key_tuple; /* Saved current key tuple */ 341 uchar *saved_primary_key; /* Saved current primary key tuple */ 342 343 /* 344 TRUE<=> saved_key_tuple (and saved_primary_key when applicable) have 345 valid values. 346 */ 347 bool read_was_interrupted; 348 349 static int compare_keys(void* arg, uchar* key1, uchar* key2); 350 static int compare_keys_reverse(void* arg, uchar* key1, uchar* key2); 351 352 friend class Key_value_records_iterator; 353 friend class DsMrr_impl; 354 friend class Mrr_ordered_rndpos_reader; 355 }; 356 357 358 /* 359 A reader that gets rowids from an Mrr_index_reader, and then sorts them 360 before getting full records with handler->rndpos() calls. 361 */ 362 363 class Mrr_ordered_rndpos_reader : public Mrr_reader 364 { 365 public: 366 int init(handler *file, Mrr_index_reader *index_reader, uint mode, 367 Lifo_buffer *buf); 368 int get_next(range_id_t *range_info); 369 int refill_buffer(bool initial); 370 private: 371 handler *file; /* Handler to use */ 372 373 /* This what we get (rowid, range_info) pairs from */ 374 Mrr_index_reader *index_reader; 375 376 /* index_reader->get_next() puts rowid here */ 377 uchar *index_rowid; 378 379 /* TRUE <=> index_reader->refill_buffer() call has returned EOF */ 380 bool index_reader_exhausted; 381 382 /* 383 TRUE <=> We should call index_reader->refill_buffer(). This happens if 384 1. we've made index_reader->get_next() call which returned EOF 385 2. we haven't made any index_reader calls (and our first call should 386 be index_reader->refill_buffer(initial=TRUE) 387 */ 388 bool index_reader_needs_refill; 389 390 /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ 391 bool is_mrr_assoc; 392 393 /* 394 When reading from ordered rowid buffer: the rowid element of the last 395 buffer element that has rowid identical to this one. 396 */ 397 uchar *last_identical_rowid; 398 399 /* Buffer to store (rowid, range_id) pairs */ 400 Lifo_buffer *rowid_buffer; 401 402 int refill_from_index_reader(); 403 }; 404 405 406 /* 407 A primitive "factory" of various Mrr_*_reader classes (the point is to 408 get various kinds of readers without having to allocate them on the heap) 409 */ 410 411 class Mrr_reader_factory 412 { 413 public: 414 Mrr_ordered_rndpos_reader ordered_rndpos_reader; 415 Mrr_ordered_index_reader ordered_index_reader; 416 Mrr_simple_index_reader simple_index_reader; 417 }; 418 419 420 #define DSMRR_IMPL_SORT_KEYS HA_MRR_IMPLEMENTATION_FLAG1 421 #define DSMRR_IMPL_SORT_ROWIDS HA_MRR_IMPLEMENTATION_FLAG2 422 423 /* 424 DS-MRR implementation for one table. Create/use one object of this class for 425 each ha_{myisam/innobase/etc} object. That object will be further referred to 426 as "the handler" 427 428 DsMrr_impl supports has the following execution strategies: 429 430 - Bypass DS-MRR, pass all calls to default MRR implementation, which is 431 an MRR-to-non-MRR call converter. 432 - Key-Ordered Retrieval 433 - Rowid-Ordered Retrieval 434 435 DsMrr_impl will use one of the above strategies, or a combination of them, 436 according to the following diagram: 437 438 (mrr function calls) 439 | 440 +----------------->-----------------+ 441 | | 442 ___________v______________ _______________v________________ 443 / default: use lookup keys \ / KEY-ORDERED RETRIEVAL: \ 444 | (or ranges) in whatever | | sort lookup keys and then make | 445 | order they are supplied | | index lookups in index order | 446 \__________________________/ \________________________________/ 447 | | | | | 448 +---<---+ | +--------------->-----------|----+ 449 | | | | 450 | | +---------------+ | 451 | ______v___ ______ | _______________v_______________ 452 | / default: read \ | / ROWID-ORDERED RETRIEVAL: \ 453 | | table records | | | Before reading table records, | 454 v | in random order | v | sort their rowids and then | 455 | \_________________/ | | read them in rowid order | 456 | | | \_______________________________/ 457 | | | | 458 | | | | 459 +-->---+ | +----<------+-----------<--------+ 460 | | | 461 v v v 462 (table records and range_ids) 463 464 The choice of strategy depends on MRR scan properties, table properties 465 (whether we're scanning clustered primary key), and @@optimizer_switch 466 settings. 467 468 Key-Ordered Retrieval 469 --------------------- 470 The idea is: if MRR scan is essentially a series of lookups on 471 472 tbl.key=value1 OR tbl.key=value2 OR ... OR tbl.key=valueN 473 474 then it makes sense to collect and order the set of lookup values, i.e. 475 476 sort(value1, value2, .. valueN) 477 478 and then do index lookups in index order. This results in fewer index page 479 fetch operations, and we also can avoid making multiple index lookups for the 480 same value. That is, if value1=valueN we can easily discover that after 481 sorting and make one index lookup for them instead of two. 482 483 Rowid-Ordered Retrieval 484 ----------------------- 485 If we do a regular index scan or a series of index lookups, we'll be hitting 486 table records at random. For disk-based engines, this is much slower than 487 reading the same records in disk order. We assume that disk ordering of 488 rows is the same as ordering of their rowids (which is provided by 489 handler::cmp_ref()) 490 In order to retrieve records in different order, we must separate index 491 scanning and record fetching, that is, MRR scan uses the following steps: 492 493 1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and 494 fill a buffer with {rowid, range_id} pairs 495 2. Sort the buffer by rowid value 496 3. for each {rowid, range_id} pair in the buffer 497 get record by rowid and return the {record, range_id} pair 498 4. Repeat the above steps until we've exhausted the list of ranges we're 499 scanning. 500 501 Buffer space management considerations 502 -------------------------------------- 503 With regards to buffer/memory management, MRR interface specifies that 504 - SQL layer provides multi_range_read_init() with buffer of certain size. 505 - MRR implementation may use (i.e. have at its disposal till the end of 506 the MRR scan) all of the buffer, or return the unused end of the buffer 507 to SQL layer. 508 509 DS-MRR needs buffer in order to accumulate and sort rowids and/or keys. When 510 we need to accumulate/sort only keys (or only rowids), it is fairly trivial. 511 512 When we need to accumulate/sort both keys and rowids, efficient buffer use 513 gets complicated. We need to: 514 - First, accumulate keys and sort them 515 - Then use the keys (smaller values go first) to obtain rowids. A key is not 516 needed after we've got matching rowids for it. 517 - Make sure that rowids are accumulated at the front of the buffer, so that we 518 can return the end part of the buffer to SQL layer, should there be too 519 few rowid values to occupy the buffer. 520 521 All of these goals are achieved by using the following scheme: 522 523 | | We get an empty buffer from SQL layer. 524 525 | *-| 526 | *----| First, we fill the buffer with keys. Key_buffer 527 | *-------| part grows from end of the buffer space to start 528 | *----------| (In this picture, the buffer is big enough to 529 | *-------------| accomodate all keys and even have some space left) 530 531 | *=============| We want to do key-ordered index scan, so we sort 532 the keys 533 534 |-x *===========| Then we use the keys get rowids. Rowids are 535 |----x *========| stored from start of buffer space towards the end. 536 |--------x *=====| The part of the buffer occupied with keys 537 |------------x *===| gradually frees up space for rowids. In this 538 |--------------x *=| picture we run out of keys before we've ran out 539 |----------------x | of buffer space (it can be other way as well). 540 541 |================x | Then we sort the rowids. 542 543 | |~~~| The unused part of the buffer is at the end, so 544 we can return it to the SQL layer. 545 546 |================* Sorted rowids are then used to read table records 547 in disk order 548 549 */ 550 551 class DsMrr_impl 552 { 553 public: 554 typedef void (handler::*range_check_toggle_func_t)(bool on); 555 DsMrr_impl()556 DsMrr_impl() 557 : secondary_file(NULL) {}; 558 init(handler * h_arg,TABLE * table_arg)559 void init(handler *h_arg, TABLE *table_arg) 560 { 561 primary_file= h_arg; 562 table= table_arg; 563 } 564 int dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 565 void *seq_init_param, uint n_ranges, uint mode, 566 HANDLER_BUFFER *buf); 567 void dsmrr_close(); 568 int dsmrr_next(range_id_t *range_info); 569 570 ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts, 571 uint *bufsz, uint *flags, Cost_estimate *cost); 572 573 ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, 574 void *seq_init_param, uint n_ranges, uint *bufsz, 575 uint *flags, Cost_estimate *cost); 576 577 int dsmrr_explain_info(uint mrr_mode, char *str, size_t size); 578 private: 579 /* Buffer to store (key, range_id) pairs */ 580 Lifo_buffer *key_buffer; 581 582 /* 583 The "owner" handler object (the one that is expected to "own" this object 584 and call its functions). 585 */ 586 handler *primary_file; 587 TABLE *table; /* Always equal to primary_file->table */ 588 589 /* 590 Secondary handler object. (created when needed, we need it when we need 591 to run both index scan and rnd_pos() scan at the same time) 592 */ 593 handler *secondary_file; 594 595 uint keyno; /* index we're running the scan on */ 596 /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ 597 bool is_mrr_assoc; 598 599 Mrr_reader_factory reader_factory; 600 601 Mrr_reader *strategy; 602 bool strategy_exhausted; 603 604 Mrr_index_reader *index_strategy; 605 606 /* The whole buffer space that we're using */ 607 uchar *full_buf; 608 uchar *full_buf_end; 609 610 /* 611 When using both rowid and key buffers: the boundary between key and rowid 612 parts of the buffer. This is the "original" value, actual memory ranges 613 used by key and rowid parts may be different because of dynamic space 614 reallocation between them. 615 */ 616 uchar *rowid_buffer_end; 617 618 /* 619 One of the following two is used for key buffer: forward is used when 620 we only need key buffer, backward is used when we need both key and rowid 621 buffers. 622 */ 623 Forward_lifo_buffer forward_key_buf; 624 Backward_lifo_buffer backward_key_buf; 625 626 /* 627 Buffer to store (rowid, range_id) pairs, or just rowids if 628 is_mrr_assoc==FALSE 629 */ 630 Forward_lifo_buffer rowid_buffer; 631 632 bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 633 Cost_estimate *cost); 634 bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 635 uint *buffer_size, uint extra_mem_overhead, 636 Cost_estimate *cost); 637 bool check_cpk_scan(THD *thd, TABLE_SHARE *share, uint keyno, uint mrr_flags); 638 639 bool setup_buffer_sharing(uint key_size_in_keybuf, key_part_map key_tuple_map); 640 641 /* Buffer_manager and its member functions */ 642 Buffer_manager buf_manager; 643 static void redistribute_buffer_space(void *dsmrr_arg); 644 static void reset_buffer_sizes(void *dsmrr_arg); 645 static void do_nothing(void *dsmrr_arg); 646 get_key_buffer()647 Lifo_buffer* get_key_buffer() { return key_buffer; } 648 649 friend class Key_value_records_iterator; 650 friend class Mrr_ordered_index_reader; 651 friend class Mrr_ordered_rndpos_reader; 652 653 int setup_two_handlers(); 654 void close_second_handler(); 655 }; 656 657 /** 658 @} (end of group DS-MRR declarations) 659 */ 660 661