1 /*
2    Copyright (c) 2009, 2011, Monty Program Ab
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
16 
17 /**
18   @defgroup DS-MRR declarations
19   @{
20 */
21 
22 /**
23   A Disk-Sweep implementation of MRR Interface (DS-MRR for short)
24 
25   This is a "plugin"(*) for storage engines that allows to
26     1. When doing index scans, read table rows in rowid order;
27     2. when making many index lookups, do them in key order and don't
28        lookup the same key value multiple times;
29     3. Do both #1 and #2, when applicable.
30   These changes are expected to speed up query execution for disk-based
31   storage engines running io-bound loads and "big" queries (ie. queries that
32   do joins and enumerate lots of records).
33 
34   (*) - only conceptually. No dynamic loading or binary compatibility of any
35         kind.
36 
37   General scheme of things:
38 
39       SQL Layer code
40        |   |   |
41        v   v   v
42       -|---|---|---- handler->multi_range_read_XXX() function calls
43        |   |   |
44       _____________________________________
45      / DS-MRR module                       \
46      | (order/de-duplicate lookup keys,    |
47      | scan indexes in key order,          |
48      | order/de-duplicate rowids,          |
49      | retrieve full record reads in rowid |
50      | order)                              |
51      \_____________________________________/
52        |   |   |
53       -|---|---|----- handler->read_range_first()/read_range_next(),
54        |   |   |      handler->index_read(), handler->rnd_pos() calls.
55        |   |   |
56        v   v   v
57       Storage engine internals
58 
59 
60   Currently DS-MRR is used by MyISAM, InnoDB and Maria storage engines.
61   Potentially it can be used with any table handler that has disk-based data
62   storage and has better performance when reading data in rowid order.
63 */
64 
65 #include "sql_lifo_buffer.h"
66 
67 class DsMrr_impl;
68 class Mrr_ordered_index_reader;
69 
70 
71 /* A structure with key parameters that's shared among several classes */
72 class Key_parameters
73 {
74 public:
75   uint         key_tuple_length; /* Length of index lookup tuple, in bytes */
76   key_part_map key_tuple_map;    /* keyparts used in index lookup tuples */
77 
78   /*
79     This is
80       = key_tuple_length   if we copy keys to buffer
81       = sizeof(void*)      if we're using pointers to materialized keys.
82   */
83   uint key_size_in_keybuf;
84 
85   /* TRUE <=> don't copy key values, use pointers to them instead.  */
86   bool use_key_pointers;
87 
88   /* TRUE <=> We can get at most one index tuple for a lookup key */
89   bool index_ranges_unique;
90 };
91 
92 
93 /**
94   A class to enumerate (record, range_id) pairs that match given key value.
95 
96   @note
97 
98   The idea is that we have a Lifo_buffer which holds (key, range_id) pairs
99   ordered by key value. From the front of the buffer we see
100 
101     (key_val1, range_id1), (key_val1, range_id2) ... (key_val2, range_idN)
102 
103   we take the first elements that have the same key value (key_val1 in the
104   example above), and make lookup into the table.  The table will have
105   multiple matches for key_val1:
106 
107                   == Table Index ==
108                    ...
109      key_val1 ->  key_val1, index_tuple1
110                   key_val1, index_tuple2
111                    ...
112                   key_val1, index_tupleN
113                    ...
114 
115   Our goal is to produce all possible combinations, i.e. we need:
116 
117     {(key_val1, index_tuple1), range_id1}
118     {(key_val1, index_tuple1), range_id2}
119        ...           ...               |
120     {(key_val1, index_tuple1), range_idN},
121 
122     {(key_val1, index_tuple2), range_id1}
123     {(key_val1, index_tuple2), range_id2}
124         ...          ...               |
125     {(key_val1, index_tuple2), range_idN},
126 
127         ...          ...          ...
128 
129     {(key_val1, index_tupleK), range_idN}
130 */
131 
132 class Key_value_records_iterator
133 {
134   /* Use this to get table handler, key buffer and other parameters */
135   Mrr_ordered_index_reader *owner;
136 
137   /* Iterator to get (key, range_id) pairs from */
138   Lifo_buffer_iterator identical_key_it;
139 
140   /*
141     Last of the identical key values (when we get this pointer from
142     identical_key_it, it will be time to stop).
143   */
144   uchar *last_identical_key_ptr;
145 
146   /*
147     FALSE <=> we're right after the init() call, the record has been already
148     read with owner->file->index_read_map() call
149   */
150   bool get_next_row;
151 
152 public:
153   int init(Mrr_ordered_index_reader *owner_arg);
154   int get_next(range_id_t *range_info);
155   void move_to_next_key_value();
156 };
157 
158 
159 /*
160   Buffer manager interface. Mrr_reader objects use it to inqure DsMrr_impl
161   to manage buffer space for them.
162 */
163 typedef struct st_buffer_manager
164 {
165 public:
166   /* Opaque value to be passed as the first argument to all member functions */
167   void *arg;
168 
169   /*
170     This is called when we've freed more space from the rowid buffer. The
171     callee will get the unused space from the rowid buffer and give it to the
172     key buffer.
173   */
174   void (*redistribute_buffer_space)(void *arg);
175 
176   /*
177     This is called when both key and rowid buffers are empty, and so it's time
178     to reset them to their original size (They've lost their original size,
179     because we were dynamically growing rowid buffer and shrinking key buffer).
180   */
181   void (*reset_buffer_sizes)(void *arg);
182 
183 } Buffer_manager;
184 
185 
186 /*
187   Mrr_reader - DS-MRR execution strategy abstraction
188 
189   A reader produces ([index]_record, range_info) pairs, and requires periodic
190   refill operations.
191 
192   - one starts using the reader by calling reader->get_next(),
193   - when a get_next() call returns HA_ERR_END_OF_FILE, one must call
194     refill_buffer() before they can make more get_next() calls.
195   - when refill_buffer() returns HA_ERR_END_OF_FILE, this means the real
196     end of stream and get_next() should not be called anymore.
197 
198   Both functions can return other error codes, these mean unrecoverable errors
199   after which one cannot continue.
200 */
201 
202 class Mrr_reader
203 {
204 public:
205   virtual int get_next(range_id_t *range_info) = 0;
206   virtual int refill_buffer(bool initial) = 0;
~Mrr_reader()207   virtual ~Mrr_reader() {}; /* just to remove compiler warning */
208 };
209 
210 
211 /*
212   A common base for readers that do index scans and produce index tuples
213 */
214 
215 class Mrr_index_reader : public Mrr_reader
216 {
217 protected:
218   handler *file; /* Handler object to use */
219 public:
220   virtual int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
221                    void *seq_init_param, uint n_ranges,
222                    uint mode, Key_parameters *key_par,
223                    Lifo_buffer *key_buffer,
224                    Buffer_manager *buf_manager_arg) = 0;
225 
226   /* Get pointer to place where every get_next() call will put rowid */
227   virtual uchar *get_rowid_ptr() = 0;
228   /* Get the rowid (call this after get_next() call) */
229   virtual void position();
230   virtual bool skip_record(range_id_t range_id, uchar *rowid) = 0;
231 
interrupt_read()232   virtual void interrupt_read() {}
resume_read()233   virtual void resume_read() {}
234 };
235 
236 
237 /*
238   A "bypass" index reader that just does and index scan. The index scan is done
239   by calling default MRR implementation (i.e.  handler::multi_range_read_XXX())
240   functions.
241 */
242 
243 class Mrr_simple_index_reader : public Mrr_index_reader
244 {
245 public:
246   int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
247            void *seq_init_param, uint n_ranges,
248            uint mode, Key_parameters *key_par,
249            Lifo_buffer *key_buffer,
250            Buffer_manager *buf_manager_arg);
251   int get_next(range_id_t *range_info);
refill_buffer(bool initial)252   int refill_buffer(bool initial) { return initial? 0: HA_ERR_END_OF_FILE; }
get_rowid_ptr()253   uchar *get_rowid_ptr() { return file->ref; }
skip_record(range_id_t range_id,uchar * rowid)254   bool skip_record(range_id_t range_id, uchar *rowid)
255   {
256     return (file->mrr_funcs.skip_record &&
257             file->mrr_funcs.skip_record(file->mrr_iter, range_id, rowid));
258   }
259 };
260 
261 
262 /*
263   A reader that sorts the key values before it makes the index lookups.
264 */
265 
266 class Mrr_ordered_index_reader : public Mrr_index_reader
267 {
268 public:
269   int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
270            void *seq_init_param, uint n_ranges,
271            uint mode, Key_parameters *key_par,
272            Lifo_buffer *key_buffer,
273            Buffer_manager *buf_manager_arg);
274   int get_next(range_id_t *range_info);
275   int refill_buffer(bool initial);
get_rowid_ptr()276   uchar *get_rowid_ptr() { return file->ref; }
277 
skip_record(range_id_t range_info,uchar * rowid)278   bool skip_record(range_id_t range_info, uchar *rowid)
279   {
280     return (mrr_funcs.skip_record &&
281             mrr_funcs.skip_record(mrr_iter, range_info, rowid));
282   }
283 
skip_index_tuple(range_id_t range_info)284   bool skip_index_tuple(range_id_t range_info)
285   {
286     return (mrr_funcs.skip_index_tuple &&
287             mrr_funcs.skip_index_tuple(mrr_iter, range_info));
288   }
289 
290   bool set_interruption_temp_buffer(uint rowid_length, uint key_len,
291                                     uint saved_pk_len,
292                                     uchar **space_start, uchar *space_end);
293   void set_no_interruption_temp_buffer();
294 
295   void interrupt_read();
296   void resume_read();
297   void position();
298 private:
299   Key_value_records_iterator kv_it;
300 
301   bool scanning_key_val_iter;
302 
303   /* Buffer to store (key, range_id) pairs */
304   Lifo_buffer *key_buffer;
305 
306   /* This manages key buffer allocation and sizing for us */
307   Buffer_manager *buf_manager;
308 
309   Key_parameters  keypar; /* index scan and lookup tuple parameters */
310 
311   /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
312   bool is_mrr_assoc;
313 
314   /* Range sequence iteration members */
315   RANGE_SEQ_IF mrr_funcs;
316   range_seq_t mrr_iter;
317 
318   /* TRUE == reached eof when enumerating ranges */
319   bool source_exhausted;
320 
321   /*
322     Following members are for interrupt_read()/resume_read(). The idea is that
323     in some cases index scan that is done by this object is interrupted by
324     rnd_pos() calls made by Mrr_ordered_rndpos_reader. The problem is that
325     we're sharing handler->record[0] with that object, and it destroys its
326     contents.
327     We need to save/restore our current
328     - index tuple (for pushed index condition checks)
329     - clustered primary key values (again, for pushed index condition checks)
330     - rowid of the last record we've retrieved (in case this rowid matches
331       multiple ranges and we'll need to return it again)
332   */
333   bool support_scan_interruptions;
334   /* Space where we save the rowid of the last record we've returned */
335   uchar *saved_rowid;
336 
337   /* TRUE <=> saved_rowid has the last saved rowid */
338   bool have_saved_rowid;
339 
340   uchar *saved_key_tuple; /* Saved current key tuple */
341   uchar *saved_primary_key; /* Saved current primary key tuple */
342 
343   /*
344     TRUE<=> saved_key_tuple (and saved_primary_key when applicable) have
345     valid values.
346   */
347   bool read_was_interrupted;
348 
349   static int compare_keys(void* arg, uchar* key1, uchar* key2);
350   static int compare_keys_reverse(void* arg, uchar* key1, uchar* key2);
351 
352   friend class Key_value_records_iterator;
353   friend class DsMrr_impl;
354   friend class Mrr_ordered_rndpos_reader;
355 };
356 
357 
358 /*
359   A reader that gets rowids from an Mrr_index_reader, and then sorts them
360   before getting full records with handler->rndpos() calls.
361 */
362 
363 class Mrr_ordered_rndpos_reader : public Mrr_reader
364 {
365 public:
366   int init(handler *file, Mrr_index_reader *index_reader, uint mode,
367            Lifo_buffer *buf, Rowid_filter *filter);
368   int get_next(range_id_t *range_info);
369   int refill_buffer(bool initial);
370 private:
371   handler *file; /* Handler to use */
372 
373   /* This what we get (rowid, range_info) pairs from */
374   Mrr_index_reader *index_reader;
375 
376   /* index_reader->get_next() puts rowid here */
377   uchar *index_rowid;
378 
379   /* TRUE <=> index_reader->refill_buffer() call has returned EOF */
380   bool index_reader_exhausted;
381 
382   /*
383     TRUE <=> We should call index_reader->refill_buffer(). This happens if
384     1. we've made index_reader->get_next() call which returned EOF
385     2. we haven't made any index_reader calls (and our first call should
386        be index_reader->refill_buffer(initial=TRUE)
387   */
388   bool index_reader_needs_refill;
389 
390   /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
391   bool is_mrr_assoc;
392 
393   /*
394     When reading from ordered rowid buffer: the rowid element of the last
395     buffer element that has rowid identical to this one.
396   */
397   uchar *last_identical_rowid;
398 
399   /* Buffer to store (rowid, range_id) pairs */
400   Lifo_buffer *rowid_buffer;
401 
402   /* Rowid filter to be checked against (if any) */
403   Rowid_filter *rowid_filter;
404 
405   int refill_from_index_reader();
406 };
407 
408 
409 /*
410   A primitive "factory" of various Mrr_*_reader classes (the point is to
411   get various kinds of readers without having to allocate them on the heap)
412 */
413 
414 class Mrr_reader_factory
415 {
416 public:
417   Mrr_ordered_rndpos_reader ordered_rndpos_reader;
418   Mrr_ordered_index_reader  ordered_index_reader;
419   Mrr_simple_index_reader   simple_index_reader;
420 };
421 
422 
423 #define DSMRR_IMPL_SORT_KEYS   HA_MRR_IMPLEMENTATION_FLAG1
424 #define DSMRR_IMPL_SORT_ROWIDS HA_MRR_IMPLEMENTATION_FLAG2
425 
426 /*
427   DS-MRR implementation for one table. Create/use one object of this class for
428   each ha_{myisam/innobase/etc} object. That object will be further referred to
429   as "the handler"
430 
431   DsMrr_impl supports has the following execution strategies:
432 
433   - Bypass DS-MRR, pass all calls to default MRR implementation, which is
434     an MRR-to-non-MRR call converter.
435   - Key-Ordered Retrieval
436   - Rowid-Ordered Retrieval
437 
438   DsMrr_impl will use one of the above strategies, or a combination of them,
439   according to the following diagram:
440 
441          (mrr function calls)
442                 |
443                 +----------------->-----------------+
444                 |                                   |
445      ___________v______________      _______________v________________
446     / default: use lookup keys \    / KEY-ORDERED RETRIEVAL:         \
447     | (or ranges) in whatever  |    | sort lookup keys and then make |
448     | order they are supplied  |    | index lookups in index order   |
449     \__________________________/    \________________________________/
450               | |  |                           |    |
451       +---<---+ |  +--------------->-----------|----+
452       |         |                              |    |
453       |         |              +---------------+    |
454       |   ______v___ ______    |     _______________v_______________
455       |  / default: read   \   |    / ROWID-ORDERED RETRIEVAL:      \
456       |  | table records   |   |    | Before reading table records, |
457       v  | in random order |   v    | sort their rowids and then    |
458       |  \_________________/   |    | read them in rowid order      |
459       |         |              |    \_______________________________/
460       |         |              |                    |
461       |         |              |                    |
462       +-->---+  |  +----<------+-----------<--------+
463              |  |  |
464              v  v  v
465       (table records and range_ids)
466 
467   The choice of strategy depends on MRR scan properties, table properties
468   (whether we're scanning clustered primary key), and @@optimizer_switch
469   settings.
470 
471   Key-Ordered Retrieval
472   ---------------------
473   The idea is: if MRR scan is essentially a series of lookups on
474 
475     tbl.key=value1 OR tbl.key=value2 OR ... OR tbl.key=valueN
476 
477   then it makes sense to collect and order the set of lookup values, i.e.
478 
479      sort(value1, value2, .. valueN)
480 
481   and then do index lookups in index order. This results in fewer index page
482   fetch operations, and we also can avoid making multiple index lookups for the
483   same value. That is, if value1=valueN we can easily discover that after
484   sorting and make one index lookup for them instead of two.
485 
486   Rowid-Ordered Retrieval
487   -----------------------
488   If we do a regular index scan or a series of index lookups, we'll be hitting
489   table records at random. For disk-based engines, this is much slower than
490   reading the same records in disk order. We assume that disk ordering of
491   rows is the same as ordering of their rowids (which is provided by
492   handler::cmp_ref())
493   In order to retrieve records in different order, we must separate index
494   scanning and record fetching, that is, MRR scan uses the following steps:
495 
496     1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and
497         fill a buffer with {rowid, range_id} pairs
498     2. Sort the buffer by rowid value
499     3. for each {rowid, range_id} pair in the buffer
500          get record by rowid and return the {record, range_id} pair
501     4. Repeat the above steps until we've exhausted the list of ranges we're
502        scanning.
503 
504   Buffer space management considerations
505   --------------------------------------
506   With regards to buffer/memory management, MRR interface specifies that
507    - SQL layer provides multi_range_read_init() with buffer of certain size.
508    - MRR implementation may use (i.e. have at its disposal till the end of
509      the MRR scan) all of the buffer, or return the unused end of the buffer
510      to SQL layer.
511 
512   DS-MRR needs buffer in order to accumulate and sort rowids and/or keys. When
513   we need to accumulate/sort only keys (or only rowids), it is fairly trivial.
514 
515   When we need to accumulate/sort both keys and rowids, efficient buffer use
516   gets complicated. We need to:
517    - First, accumulate keys and sort them
518    - Then use the keys (smaller values go first) to obtain rowids. A key is not
519      needed after we've got matching rowids for it.
520    - Make sure that rowids are accumulated at the front of the buffer, so that we
521      can return the end part of the buffer to SQL layer, should there be too
522      few rowid values to occupy the buffer.
523 
524   All of these goals are achieved by using the following scheme:
525 
526      |                    |   We get an empty buffer from SQL layer.
527 
528      |                  *-|
529      |               *----|   First, we fill the buffer with keys. Key_buffer
530      |            *-------|   part grows from end of the buffer space to start
531      |         *----------|   (In this picture, the buffer is big enough to
532      |      *-------------|    accomodate all keys and even have some space left)
533 
534      |      *=============|   We want to do key-ordered index scan, so we sort
535                               the keys
536 
537      |-x      *===========|   Then we use the keys get rowids. Rowids are
538      |----x      *========|   stored from start of buffer space towards the end.
539      |--------x     *=====|   The part of the buffer occupied with keys
540      |------------x   *===|   gradually frees up space for rowids. In this
541      |--------------x   *=|   picture we run out of keys before we've ran out
542      |----------------x   |   of buffer space (it can be other way as well).
543 
544      |================x   |   Then we sort the rowids.
545 
546      |                |~~~|   The unused part of the buffer is at the end, so
547                               we can return it to the SQL layer.
548 
549      |================*       Sorted rowids are then used to read table records
550                               in disk order
551 
552 */
553 
554 class DsMrr_impl
555 {
556 public:
557   typedef void (handler::*range_check_toggle_func_t)(bool on);
558 
DsMrr_impl()559   DsMrr_impl()
560     : secondary_file(NULL),
561       rowid_filter(NULL) {};
562 
init(handler * h_arg,TABLE * table_arg)563   void init(handler *h_arg, TABLE *table_arg)
564   {
565     primary_file= h_arg;
566     table= table_arg;
567   }
568   int dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
569                  void *seq_init_param, uint n_ranges, uint mode,
570                  HANDLER_BUFFER *buf);
571   void dsmrr_close();
572   int dsmrr_next(range_id_t *range_info);
573 
574   ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts,
575                      uint *bufsz, uint *flags, Cost_estimate *cost);
576 
577   ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
578                             void *seq_init_param, uint n_ranges, uint *bufsz,
579                             uint *flags, Cost_estimate *cost);
580 
581   int dsmrr_explain_info(uint mrr_mode, char *str, size_t size);
582 private:
583   /* Buffer to store (key, range_id) pairs */
584   Lifo_buffer *key_buffer;
585 
586   /*
587     The "owner" handler object (the one that is expected to "own" this object
588     and call its functions).
589   */
590   handler *primary_file;
591   TABLE *table; /* Always equal to primary_file->table */
592 
593   /*
594     Secondary handler object. (created when needed, we need it when we need
595     to run both index scan and rnd_pos() scan at the same time)
596   */
597   handler *secondary_file;
598 
599   /*
600     The rowid filter that DS-MRR has "unpushed" from the storage engine.
601     If it's present, DS-MRR will use it.
602   */
603   Rowid_filter *rowid_filter;
604 
605   uint keyno; /* index we're running the scan on */
606   /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
607   bool is_mrr_assoc;
608 
609   Mrr_reader_factory reader_factory;
610 
611   Mrr_reader *strategy;
612   bool strategy_exhausted;
613 
614   Mrr_index_reader *index_strategy;
615 
616   /* The whole buffer space that we're using */
617   uchar *full_buf;
618   uchar *full_buf_end;
619 
620   /*
621     When using both rowid and key buffers: the boundary between key and rowid
622     parts of the buffer. This is the "original" value, actual memory ranges
623     used by key and rowid parts may be different because of dynamic space
624     reallocation between them.
625   */
626   uchar *rowid_buffer_end;
627 
628   /*
629     One of the following two is used for key buffer: forward is used when
630     we only need key buffer, backward is used when we need both key and rowid
631     buffers.
632   */
633   Forward_lifo_buffer forward_key_buf;
634   Backward_lifo_buffer backward_key_buf;
635 
636   /*
637     Buffer to store (rowid, range_id) pairs, or just rowids if
638     is_mrr_assoc==FALSE
639   */
640   Forward_lifo_buffer rowid_buffer;
641 
642   bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz,
643                        Cost_estimate *cost);
644   bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
645                                uint *buffer_size, uint extra_mem_overhead,
646                                Cost_estimate *cost);
647   bool check_cpk_scan(THD *thd, TABLE_SHARE *share, uint keyno, uint mrr_flags);
648 
649   bool setup_buffer_sharing(uint key_size_in_keybuf, key_part_map key_tuple_map);
650 
651   /* Buffer_manager and its member functions */
652   Buffer_manager buf_manager;
653   static void redistribute_buffer_space(void *dsmrr_arg);
654   static void reset_buffer_sizes(void *dsmrr_arg);
655   static void do_nothing(void *dsmrr_arg);
656 
get_key_buffer()657   Lifo_buffer* get_key_buffer() { return key_buffer; }
658 
659   friend class Key_value_records_iterator;
660   friend class Mrr_ordered_index_reader;
661   friend class Mrr_ordered_rndpos_reader;
662 
663   int  setup_two_handlers();
664   void close_second_handler();
665 };
666 
667 /**
668   @} (end of group DS-MRR declarations)
669 */
670 
671