1 /*
2    Copyright (c) 2009, 2011, Monty Program Ab
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
16 
17 /**
18   @defgroup DS-MRR declarations
19   @{
20 */
21 
22 /**
23   A Disk-Sweep implementation of MRR Interface (DS-MRR for short)
24 
25   This is a "plugin"(*) for storage engines that allows to
26     1. When doing index scans, read table rows in rowid order;
27     2. when making many index lookups, do them in key order and don't
28        lookup the same key value multiple times;
29     3. Do both #1 and #2, when applicable.
30   These changes are expected to speed up query execution for disk-based
31   storage engines running io-bound loads and "big" queries (ie. queries that
32   do joins and enumerate lots of records).
33 
34   (*) - only conceptually. No dynamic loading or binary compatibility of any
35         kind.
36 
37   General scheme of things:
38 
39       SQL Layer code
40        |   |   |
41        v   v   v
42       -|---|---|---- handler->multi_range_read_XXX() function calls
43        |   |   |
44       _____________________________________
45      / DS-MRR module                       \
46      | (order/de-duplicate lookup keys,    |
47      | scan indexes in key order,          |
48      | order/de-duplicate rowids,          |
49      | retrieve full record reads in rowid |
50      | order)                              |
51      \_____________________________________/
52        |   |   |
53       -|---|---|----- handler->read_range_first()/read_range_next(),
54        |   |   |      handler->index_read(), handler->rnd_pos() calls.
55        |   |   |
56        v   v   v
57       Storage engine internals
58 
59 
60   Currently DS-MRR is used by MyISAM, InnoDB and Maria storage engines.
61   Potentially it can be used with any table handler that has disk-based data
62   storage and has better performance when reading data in rowid order.
63 */
64 
65 #include "sql_lifo_buffer.h"
66 
67 class DsMrr_impl;
68 class Mrr_ordered_index_reader;
69 
70 
71 /* A structure with key parameters that's shared among several classes */
72 class Key_parameters
73 {
74 public:
75   uint         key_tuple_length; /* Length of index lookup tuple, in bytes */
76   key_part_map key_tuple_map;    /* keyparts used in index lookup tuples */
77 
78   /*
79     This is
80       = key_tuple_length   if we copy keys to buffer
81       = sizeof(void*)      if we're using pointers to materialized keys.
82   */
83   uint key_size_in_keybuf;
84 
85   /* TRUE <=> don't copy key values, use pointers to them instead.  */
86   bool use_key_pointers;
87 
88   /* TRUE <=> We can get at most one index tuple for a lookup key */
89   bool index_ranges_unique;
90 };
91 
92 
93 /**
94   A class to enumerate (record, range_id) pairs that match given key value.
95 
96   @note
97 
98   The idea is that we have a Lifo_buffer which holds (key, range_id) pairs
99   ordered by key value. From the front of the buffer we see
100 
101     (key_val1, range_id1), (key_val1, range_id2) ... (key_val2, range_idN)
102 
103   we take the first elements that have the same key value (key_val1 in the
104   example above), and make lookup into the table.  The table will have
105   multiple matches for key_val1:
106 
107                   == Table Index ==
108                    ...
109      key_val1 ->  key_val1, index_tuple1
110                   key_val1, index_tuple2
111                    ...
112                   key_val1, index_tupleN
113                    ...
114 
115   Our goal is to produce all possible combinations, i.e. we need:
116 
117     {(key_val1, index_tuple1), range_id1}
118     {(key_val1, index_tuple1), range_id2}
119        ...           ...               |
120     {(key_val1, index_tuple1), range_idN},
121 
122     {(key_val1, index_tuple2), range_id1}
123     {(key_val1, index_tuple2), range_id2}
124         ...          ...               |
125     {(key_val1, index_tuple2), range_idN},
126 
127         ...          ...          ...
128 
129     {(key_val1, index_tupleK), range_idN}
130 */
131 
132 class Key_value_records_iterator
133 {
134   /* Use this to get table handler, key buffer and other parameters */
135   Mrr_ordered_index_reader *owner;
136 
137   /* Iterator to get (key, range_id) pairs from */
138   Lifo_buffer_iterator identical_key_it;
139 
140   /*
141     Last of the identical key values (when we get this pointer from
142     identical_key_it, it will be time to stop).
143   */
144   uchar *last_identical_key_ptr;
145 
146   /*
147     FALSE <=> we're right after the init() call, the record has been already
148     read with owner->file->index_read_map() call
149   */
150   bool get_next_row;
151 
152 public:
153   int init(Mrr_ordered_index_reader *owner_arg);
154   int get_next(range_id_t *range_info);
155   void move_to_next_key_value();
156 };
157 
158 
159 /*
160   Buffer manager interface. Mrr_reader objects use it to inqure DsMrr_impl
161   to manage buffer space for them.
162 */
163 typedef struct st_buffer_manager
164 {
165 public:
166   /* Opaque value to be passed as the first argument to all member functions */
167   void *arg;
168 
169   /*
170     This is called when we've freed more space from the rowid buffer. The
171     callee will get the unused space from the rowid buffer and give it to the
172     key buffer.
173   */
174   void (*redistribute_buffer_space)(void *arg);
175 
176   /*
177     This is called when both key and rowid buffers are empty, and so it's time
178     to reset them to their original size (They've lost their original size,
179     because we were dynamically growing rowid buffer and shrinking key buffer).
180   */
181   void (*reset_buffer_sizes)(void *arg);
182 
183 } Buffer_manager;
184 
185 
186 /*
187   Mrr_reader - DS-MRR execution strategy abstraction
188 
189   A reader produces ([index]_record, range_info) pairs, and requires periodic
190   refill operations.
191 
192   - one starts using the reader by calling reader->get_next(),
193   - when a get_next() call returns HA_ERR_END_OF_FILE, one must call
194     refill_buffer() before they can make more get_next() calls.
195   - when refill_buffer() returns HA_ERR_END_OF_FILE, this means the real
196     end of stream and get_next() should not be called anymore.
197 
198   Both functions can return other error codes, these mean unrecoverable errors
199   after which one cannot continue.
200 */
201 
202 class Mrr_reader
203 {
204 public:
205   virtual int get_next(range_id_t *range_info) = 0;
206   virtual int refill_buffer(bool initial) = 0;
~Mrr_reader()207   virtual ~Mrr_reader() {}; /* just to remove compiler warning */
208 };
209 
210 
211 /*
212   A common base for readers that do index scans and produce index tuples
213 */
214 
215 class Mrr_index_reader : public Mrr_reader
216 {
217 protected:
218   handler *file; /* Handler object to use */
219 public:
220   virtual int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
221                    void *seq_init_param, uint n_ranges,
222                    uint mode, Key_parameters *key_par,
223                    Lifo_buffer *key_buffer,
224                    Buffer_manager *buf_manager_arg) = 0;
225 
226   /* Get pointer to place where every get_next() call will put rowid */
227   virtual uchar *get_rowid_ptr() = 0;
228   /* Get the rowid (call this after get_next() call) */
229   virtual void position();
230   virtual bool skip_record(range_id_t range_id, uchar *rowid) = 0;
231 
interrupt_read()232   virtual void interrupt_read() {}
resume_read()233   virtual void resume_read() {}
234 };
235 
236 
237 /*
238   A "bypass" index reader that just does and index scan. The index scan is done
239   by calling default MRR implementation (i.e.  handler::multi_range_read_XXX())
240   functions.
241 */
242 
243 class Mrr_simple_index_reader : public Mrr_index_reader
244 {
245 public:
246   int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
247            void *seq_init_param, uint n_ranges,
248            uint mode, Key_parameters *key_par,
249            Lifo_buffer *key_buffer,
250            Buffer_manager *buf_manager_arg);
251   int get_next(range_id_t *range_info);
refill_buffer(bool initial)252   int refill_buffer(bool initial) { return initial? 0: HA_ERR_END_OF_FILE; }
get_rowid_ptr()253   uchar *get_rowid_ptr() { return file->ref; }
skip_record(range_id_t range_id,uchar * rowid)254   bool skip_record(range_id_t range_id, uchar *rowid)
255   {
256     return (file->mrr_funcs.skip_record &&
257             file->mrr_funcs.skip_record(file->mrr_iter, range_id, rowid));
258   }
259 };
260 
261 
262 /*
263   A reader that sorts the key values before it makes the index lookups.
264 */
265 
266 class Mrr_ordered_index_reader : public Mrr_index_reader
267 {
268 public:
269   int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
270            void *seq_init_param, uint n_ranges,
271            uint mode, Key_parameters *key_par,
272            Lifo_buffer *key_buffer,
273            Buffer_manager *buf_manager_arg);
274   int get_next(range_id_t *range_info);
275   int refill_buffer(bool initial);
get_rowid_ptr()276   uchar *get_rowid_ptr() { return file->ref; }
277 
skip_record(range_id_t range_info,uchar * rowid)278   bool skip_record(range_id_t range_info, uchar *rowid)
279   {
280     return (mrr_funcs.skip_record &&
281             mrr_funcs.skip_record(mrr_iter, range_info, rowid));
282   }
283 
skip_index_tuple(range_id_t range_info)284   bool skip_index_tuple(range_id_t range_info)
285   {
286     return (mrr_funcs.skip_index_tuple &&
287             mrr_funcs.skip_index_tuple(mrr_iter, range_info));
288   }
289 
290   bool set_interruption_temp_buffer(uint rowid_length, uint key_len,
291                                     uint saved_pk_len,
292                                     uchar **space_start, uchar *space_end);
293   void set_no_interruption_temp_buffer();
294 
295   void interrupt_read();
296   void resume_read();
297   void position();
298 private:
299   Key_value_records_iterator kv_it;
300 
301   bool scanning_key_val_iter;
302 
303   /* Buffer to store (key, range_id) pairs */
304   Lifo_buffer *key_buffer;
305 
306   /* This manages key buffer allocation and sizing for us */
307   Buffer_manager *buf_manager;
308 
309   Key_parameters  keypar; /* index scan and lookup tuple parameters */
310 
311   /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
312   bool is_mrr_assoc;
313 
314   /* Range sequence iteration members */
315   RANGE_SEQ_IF mrr_funcs;
316   range_seq_t mrr_iter;
317 
318   /* TRUE == reached eof when enumerating ranges */
319   bool source_exhausted;
320 
321   /*
322     Following members are for interrupt_read()/resume_read(). The idea is that
323     in some cases index scan that is done by this object is interrupted by
324     rnd_pos() calls made by Mrr_ordered_rndpos_reader. The problem is that
325     we're sharing handler->record[0] with that object, and it destroys its
326     contents.
327     We need to save/restore our current
328     - index tuple (for pushed index condition checks)
329     - clustered primary key values (again, for pushed index condition checks)
330     - rowid of the last record we've retrieved (in case this rowid matches
331       multiple ranges and we'll need to return it again)
332   */
333   bool support_scan_interruptions;
334   /* Space where we save the rowid of the last record we've returned */
335   uchar *saved_rowid;
336 
337   /* TRUE <=> saved_rowid has the last saved rowid */
338   bool have_saved_rowid;
339 
340   uchar *saved_key_tuple; /* Saved current key tuple */
341   uchar *saved_primary_key; /* Saved current primary key tuple */
342 
343   /*
344     TRUE<=> saved_key_tuple (and saved_primary_key when applicable) have
345     valid values.
346   */
347   bool read_was_interrupted;
348 
349   static int compare_keys(void* arg, uchar* key1, uchar* key2);
350   static int compare_keys_reverse(void* arg, uchar* key1, uchar* key2);
351 
352   friend class Key_value_records_iterator;
353   friend class DsMrr_impl;
354   friend class Mrr_ordered_rndpos_reader;
355 };
356 
357 
358 /*
359   A reader that gets rowids from an Mrr_index_reader, and then sorts them
360   before getting full records with handler->rndpos() calls.
361 */
362 
363 class Mrr_ordered_rndpos_reader : public Mrr_reader
364 {
365 public:
366   int init(handler *file, Mrr_index_reader *index_reader, uint mode,
367            Lifo_buffer *buf);
368   int get_next(range_id_t *range_info);
369   int refill_buffer(bool initial);
370 private:
371   handler *file; /* Handler to use */
372 
373   /* This what we get (rowid, range_info) pairs from */
374   Mrr_index_reader *index_reader;
375 
376   /* index_reader->get_next() puts rowid here */
377   uchar *index_rowid;
378 
379   /* TRUE <=> index_reader->refill_buffer() call has returned EOF */
380   bool index_reader_exhausted;
381 
382   /*
383     TRUE <=> We should call index_reader->refill_buffer(). This happens if
384     1. we've made index_reader->get_next() call which returned EOF
385     2. we haven't made any index_reader calls (and our first call should
386        be index_reader->refill_buffer(initial=TRUE)
387   */
388   bool index_reader_needs_refill;
389 
390   /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
391   bool is_mrr_assoc;
392 
393   /*
394     When reading from ordered rowid buffer: the rowid element of the last
395     buffer element that has rowid identical to this one.
396   */
397   uchar *last_identical_rowid;
398 
399   /* Buffer to store (rowid, range_id) pairs */
400   Lifo_buffer *rowid_buffer;
401 
402   int refill_from_index_reader();
403 };
404 
405 
406 /*
407   A primitive "factory" of various Mrr_*_reader classes (the point is to
408   get various kinds of readers without having to allocate them on the heap)
409 */
410 
411 class Mrr_reader_factory
412 {
413 public:
414   Mrr_ordered_rndpos_reader ordered_rndpos_reader;
415   Mrr_ordered_index_reader  ordered_index_reader;
416   Mrr_simple_index_reader   simple_index_reader;
417 };
418 
419 
420 #define DSMRR_IMPL_SORT_KEYS   HA_MRR_IMPLEMENTATION_FLAG1
421 #define DSMRR_IMPL_SORT_ROWIDS HA_MRR_IMPLEMENTATION_FLAG2
422 
423 /*
424   DS-MRR implementation for one table. Create/use one object of this class for
425   each ha_{myisam/innobase/etc} object. That object will be further referred to
426   as "the handler"
427 
428   DsMrr_impl supports has the following execution strategies:
429 
430   - Bypass DS-MRR, pass all calls to default MRR implementation, which is
431     an MRR-to-non-MRR call converter.
432   - Key-Ordered Retrieval
433   - Rowid-Ordered Retrieval
434 
435   DsMrr_impl will use one of the above strategies, or a combination of them,
436   according to the following diagram:
437 
438          (mrr function calls)
439                 |
440                 +----------------->-----------------+
441                 |                                   |
442      ___________v______________      _______________v________________
443     / default: use lookup keys \    / KEY-ORDERED RETRIEVAL:         \
444     | (or ranges) in whatever  |    | sort lookup keys and then make |
445     | order they are supplied  |    | index lookups in index order   |
446     \__________________________/    \________________________________/
447               | |  |                           |    |
448       +---<---+ |  +--------------->-----------|----+
449       |         |                              |    |
450       |         |              +---------------+    |
451       |   ______v___ ______    |     _______________v_______________
452       |  / default: read   \   |    / ROWID-ORDERED RETRIEVAL:      \
453       |  | table records   |   |    | Before reading table records, |
454       v  | in random order |   v    | sort their rowids and then    |
455       |  \_________________/   |    | read them in rowid order      |
456       |         |              |    \_______________________________/
457       |         |              |                    |
458       |         |              |                    |
459       +-->---+  |  +----<------+-----------<--------+
460              |  |  |
461              v  v  v
462       (table records and range_ids)
463 
464   The choice of strategy depends on MRR scan properties, table properties
465   (whether we're scanning clustered primary key), and @@optimizer_switch
466   settings.
467 
468   Key-Ordered Retrieval
469   ---------------------
470   The idea is: if MRR scan is essentially a series of lookups on
471 
472     tbl.key=value1 OR tbl.key=value2 OR ... OR tbl.key=valueN
473 
474   then it makes sense to collect and order the set of lookup values, i.e.
475 
476      sort(value1, value2, .. valueN)
477 
478   and then do index lookups in index order. This results in fewer index page
479   fetch operations, and we also can avoid making multiple index lookups for the
480   same value. That is, if value1=valueN we can easily discover that after
481   sorting and make one index lookup for them instead of two.
482 
483   Rowid-Ordered Retrieval
484   -----------------------
485   If we do a regular index scan or a series of index lookups, we'll be hitting
486   table records at random. For disk-based engines, this is much slower than
487   reading the same records in disk order. We assume that disk ordering of
488   rows is the same as ordering of their rowids (which is provided by
489   handler::cmp_ref())
490   In order to retrieve records in different order, we must separate index
491   scanning and record fetching, that is, MRR scan uses the following steps:
492 
493     1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and
494         fill a buffer with {rowid, range_id} pairs
495     2. Sort the buffer by rowid value
496     3. for each {rowid, range_id} pair in the buffer
497          get record by rowid and return the {record, range_id} pair
498     4. Repeat the above steps until we've exhausted the list of ranges we're
499        scanning.
500 
501   Buffer space management considerations
502   --------------------------------------
503   With regards to buffer/memory management, MRR interface specifies that
504    - SQL layer provides multi_range_read_init() with buffer of certain size.
505    - MRR implementation may use (i.e. have at its disposal till the end of
506      the MRR scan) all of the buffer, or return the unused end of the buffer
507      to SQL layer.
508 
509   DS-MRR needs buffer in order to accumulate and sort rowids and/or keys. When
510   we need to accumulate/sort only keys (or only rowids), it is fairly trivial.
511 
512   When we need to accumulate/sort both keys and rowids, efficient buffer use
513   gets complicated. We need to:
514    - First, accumulate keys and sort them
515    - Then use the keys (smaller values go first) to obtain rowids. A key is not
516      needed after we've got matching rowids for it.
517    - Make sure that rowids are accumulated at the front of the buffer, so that we
518      can return the end part of the buffer to SQL layer, should there be too
519      few rowid values to occupy the buffer.
520 
521   All of these goals are achieved by using the following scheme:
522 
523      |                    |   We get an empty buffer from SQL layer.
524 
525      |                  *-|
526      |               *----|   First, we fill the buffer with keys. Key_buffer
527      |            *-------|   part grows from end of the buffer space to start
528      |         *----------|   (In this picture, the buffer is big enough to
529      |      *-------------|    accomodate all keys and even have some space left)
530 
531      |      *=============|   We want to do key-ordered index scan, so we sort
532                               the keys
533 
534      |-x      *===========|   Then we use the keys get rowids. Rowids are
535      |----x      *========|   stored from start of buffer space towards the end.
536      |--------x     *=====|   The part of the buffer occupied with keys
537      |------------x   *===|   gradually frees up space for rowids. In this
538      |--------------x   *=|   picture we run out of keys before we've ran out
539      |----------------x   |   of buffer space (it can be other way as well).
540 
541      |================x   |   Then we sort the rowids.
542 
543      |                |~~~|   The unused part of the buffer is at the end, so
544                               we can return it to the SQL layer.
545 
546      |================*       Sorted rowids are then used to read table records
547                               in disk order
548 
549 */
550 
551 class DsMrr_impl
552 {
553 public:
554   typedef void (handler::*range_check_toggle_func_t)(bool on);
555 
DsMrr_impl()556   DsMrr_impl()
557     : secondary_file(NULL) {};
558 
init(handler * h_arg,TABLE * table_arg)559   void init(handler *h_arg, TABLE *table_arg)
560   {
561     primary_file= h_arg;
562     table= table_arg;
563   }
564   int dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
565                  void *seq_init_param, uint n_ranges, uint mode,
566                  HANDLER_BUFFER *buf);
567   void dsmrr_close();
568   int dsmrr_next(range_id_t *range_info);
569 
570   ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts,
571                      uint *bufsz, uint *flags, Cost_estimate *cost);
572 
573   ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
574                             void *seq_init_param, uint n_ranges, uint *bufsz,
575                             uint *flags, Cost_estimate *cost);
576 
577   int dsmrr_explain_info(uint mrr_mode, char *str, size_t size);
578 private:
579   /* Buffer to store (key, range_id) pairs */
580   Lifo_buffer *key_buffer;
581 
582   /*
583     The "owner" handler object (the one that is expected to "own" this object
584     and call its functions).
585   */
586   handler *primary_file;
587   TABLE *table; /* Always equal to primary_file->table */
588 
589   /*
590     Secondary handler object. (created when needed, we need it when we need
591     to run both index scan and rnd_pos() scan at the same time)
592   */
593   handler *secondary_file;
594 
595   uint keyno; /* index we're running the scan on */
596   /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
597   bool is_mrr_assoc;
598 
599   Mrr_reader_factory reader_factory;
600 
601   Mrr_reader *strategy;
602   bool strategy_exhausted;
603 
604   Mrr_index_reader *index_strategy;
605 
606   /* The whole buffer space that we're using */
607   uchar *full_buf;
608   uchar *full_buf_end;
609 
610   /*
611     When using both rowid and key buffers: the boundary between key and rowid
612     parts of the buffer. This is the "original" value, actual memory ranges
613     used by key and rowid parts may be different because of dynamic space
614     reallocation between them.
615   */
616   uchar *rowid_buffer_end;
617 
618   /*
619     One of the following two is used for key buffer: forward is used when
620     we only need key buffer, backward is used when we need both key and rowid
621     buffers.
622   */
623   Forward_lifo_buffer forward_key_buf;
624   Backward_lifo_buffer backward_key_buf;
625 
626   /*
627     Buffer to store (rowid, range_id) pairs, or just rowids if
628     is_mrr_assoc==FALSE
629   */
630   Forward_lifo_buffer rowid_buffer;
631 
632   bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz,
633                        Cost_estimate *cost);
634   bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
635                                uint *buffer_size, uint extra_mem_overhead,
636                                Cost_estimate *cost);
637   bool check_cpk_scan(THD *thd, TABLE_SHARE *share, uint keyno, uint mrr_flags);
638 
639   bool setup_buffer_sharing(uint key_size_in_keybuf, key_part_map key_tuple_map);
640 
641   /* Buffer_manager and its member functions */
642   Buffer_manager buf_manager;
643   static void redistribute_buffer_space(void *dsmrr_arg);
644   static void reset_buffer_sizes(void *dsmrr_arg);
645   static void do_nothing(void *dsmrr_arg);
646 
get_key_buffer()647   Lifo_buffer* get_key_buffer() { return key_buffer; }
648 
649   friend class Key_value_records_iterator;
650   friend class Mrr_ordered_index_reader;
651   friend class Mrr_ordered_rndpos_reader;
652 
653   int  setup_two_handlers();
654   void close_second_handler();
655 };
656 
657 /**
658   @} (end of group DS-MRR declarations)
659 */
660 
661